Skip to content

Instantly share code, notes, and snippets.

Created April 22, 2014 13:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save alexhanna/11178332 to your computer and use it in GitHub Desktop.
Save alexhanna/11178332 to your computer and use it in GitHub Desktop.
Gist for generating sentiment scores for political tweets from the gardenhose and a focused sample
from __future__ import division
import csv, logging, math, os.path
import pickle, random, re, string
import time
import numpy as np
import pandas as pd
from nltk.tokenize.regexp import WordPunctTokenizer
def repRT(row):
    """Return the text to analyse for one tweet row.

    When the row carries retweet text (``row['rt-text']`` is not null), that
    text is returned; otherwise the row's own ``row['text']`` is returned.

    ``row`` is anything indexable by column name (e.g. the Series that
    ``DataFrame.apply(..., axis=1)`` passes in).
    """
    if pd.isnull(row['rt-text']):
        return row['text']
    return row['rt-text']
# Matches any single ASCII punctuation character from string.punctuation.
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))


def sentiment_score(text):
    """Score *text* by the balance of positive and negative words.

    Punctuation is deleted, the remainder is tokenized with the module-level
    ``toke`` tokenizer, and the tokens are de-duplicated. The score is

        (distinct positive words - distinct negative words) / distinct words

    using the module-level ``pos_words`` and ``neg_words`` sets. Text that
    yields no tokens scores 0.
    """
    # A regex substitution is used instead of
    # ``text.translate(string.maketrans("", ""), string.punctuation)``:
    # that form only works on Python 2 byte strings (it raises TypeError on
    # unicode text, and ``string.maketrans`` is gone in Python 3).
    text = _PUNCTUATION_RE.sub('', text)
    words = set(toke.tokenize(text))
    if not words:
        return 0
    n_pos = len(pos_words & words)
    n_neg = len(neg_words & words)
    return (n_pos - n_neg) / len(words)
def toMin(x):
    """Truncate a ``'%Y-%m-%d %H:%M:%S'`` timestamp string to the minute.

    The input is parsed with ``time.strptime`` and re-formatted with the
    seconds field fixed to ``00``, e.g. ``'2012-10-03 21:05:33'`` becomes
    ``'2012-10-03 21:05:00'``. ``time.strptime`` raises ``ValueError`` when
    *x* does not match the expected format.
    """
    parsed = time.strptime(x, '%Y-%m-%d %H:%M:%S')
    return time.strftime('%Y-%m-%d %H:%M:00', parsed)
## Positive and negative word lists are taken from the work of Hu and Liu.
## If you use them, cite the following:
# Minqing Hu and Bing Liu. "Mining and Summarizing Customer Reviews."
# Proceedings of the ACM SIGKDD International Conference on Knowledge
# Discovery and Data Mining (KDD-2004), Aug 22-25, 2004, Seattle,
# Washington, USA,
# Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing
# and Comparing Opinions on the Web." Proceedings of the 14th
# International World Wide Web conference (WWW-2005), May 10-14,
# 2005, Chiba, Japan.
def _load_word_set(path):
    """Read the file at *path* and return its newline-separated entries as a set."""
    # The context manager closes the file handle as soon as it has been read.
    with open(path, "r") as handle:
        return set(handle.read().split("\n"))


## Tokenizer and word lists shared by sentiment_score()
toke = WordPunctTokenizer()
pos_words = _load_word_set("../data/positive.txt")
neg_words = _load_word_set("../data/negative.txt")
##### gardenhose
gh_cols = [
    "id_str", "created_at", "text", "user-id_str", "user-name",
    "user-screen_name", "user-userlevel",
    "rt-id_str", "rt-created_at", "rt-text", "rt-user-id_str",
    "rt-user-name", "rt-user-screen_name", "rt-user-userlevel",
]

## load tweets for gardenhose (tab-separated, no quoting, column names supplied here)
df = pd.read_csv("/project/hanna/elex2012/gh.20121003-usprez.csv",
                 sep="\t", quoting=csv.QUOTE_NONE, index_col=False, names=gh_cols)

## move RT in main text because of convenience
df['text'] = df.apply(repRT, axis=1)

## lowercase
df['text'] = df['text'].str.lower()

## Index tweets that mention only Obama or Romney
## (1 = mentions this candidate and not the other, 0 = otherwise)
df['obama'] = df['text'].apply(lambda x: 1 if 'obama' in x and 'romney' not in x else 0)
df['romney'] = df['text'].apply(lambda x: 1 if 'obama' not in x and 'romney' in x else 0)

## per-tweet sentiment and minute-resolution timestamp
df['score'] = df['text'].apply(sentiment_score)
df['date'] = df['created_at'].apply(toMin)

## mean and standard deviation of the score per minute, per candidate
oscores = df.loc[df['obama'] == 1].groupby('date')['score'].agg(['mean', 'std'])
oscores['person'] = 'obama'

rscores = df.loc[df['romney'] == 1].groupby('date')['score'].agg(['mean', 'std'])
rscores['person'] = 'romney'

## DataFrame.append was removed in pandas 2.0; concat stacks the rows the same way
scores = pd.concat([oscores, rscores])
##### elex2012
fs_cols = [
    "id_str", "created_at", "text", "user-id_str", "user-screen_name",
    "rt-id_str", "rt-created_at", "rt-text", "rt-user-id_str", "rt-user-screen_name",
]

## load tweets for the focused sample (tab-separated, no quoting, column names supplied here)
fs = pd.read_csv("/project/hanna/elex2012/elex2012.20121003.csv",
                 sep="\t", quoting=csv.QUOTE_NONE, index_col=False, names=fs_cols)

## user id -> user level lookup table
ul = pd.read_csv("/home/a/ahanna/sandbox/hadoop/streaming/data/follow-all.txt",
                 sep="\t", quoting=csv.QUOTE_NONE, index_col=False,
                 names=['user-id_str', 'user-level'])

## attach the user level to each tweet; 'user-id_str' is the only shared column,
## and the default inner join keeps only tweets whose user appears in the lookup
fs = fs.merge(ul, on='user-id_str')

## move RT in main text because of convenience
fs['text'] = fs.apply(repRT, axis=1)

## lowercase
fs['text'] = fs['text'].str.lower()

## Index tweets that mention only Obama or Romney
## (1 = mentions this candidate and not the other, 0 = otherwise)
fs['obama'] = fs['text'].apply(lambda x: 1 if 'obama' in x and 'romney' not in x else 0)
fs['romney'] = fs['text'].apply(lambda x: 1 if 'obama' not in x and 'romney' in x else 0)

## per-tweet sentiment and minute-resolution timestamp
fs['score'] = fs['text'].apply(sentiment_score)
fs['date'] = fs['created_at'].apply(toMin)

## mean and standard deviation of the score per minute and user level, per candidate
oscores = fs.loc[fs['obama'] == 1].groupby(['date', 'user-level'])['score'].agg(['mean', 'std'])
oscores['person'] = 'obama'

rscores = fs.loc[fs['romney'] == 1].groupby(['date', 'user-level'])['score'].agg(['mean', 'std'])
rscores['person'] = 'romney'

## DataFrame.append was removed in pandas 2.0; concat stacks the rows the same way
scores = pd.concat([oscores, rscores])
# scores = pd.DataFrame({
# 'created_at': fs['created_at'],
# 'user_level': fs['user-level'],
# 'obama': fs['obama'],
# 'romney': fs['romney'],
# 'score': fs['score']
# })
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment