from __future__ import division
import csv, logging, math, os.path
import pickle, random, re, string
import datetime, time
import numpy as np
import pandas as pd
import scipy as sp
## metrics
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.utils.extmath import density
## vectorizers
from sklearn.feature_extraction.text import TfidfVectorizer
## CV
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import ShuffleSplit
from sklearn.cross_validation import cross_val_score
## Classifiers (only LogisticRegression is actually used below)
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import nltk.data
from nltk.tokenize.regexp import WordPunctTokenizer
###########
##### Generating training data
###########
def generateTraining(df, filename):
    val = pd.read_csv(filename)  ## NB: read but unused below
    ## Classification:
    ## grab a random sample of 500 tweets for each candidate
    idx = []
    idx += random.sample(df[df['obama'] == 1].index, 500)
    idx += random.sample(df[df['romney'] == 1].index, 500)
    ## randomize the index
    np.random.shuffle(idx)
    ## get the tweets
    sub = df.ix[idx]
    ## write to disk (relies on the module-level `debate` variable)
    sub.to_csv("../data/" + debate + "-valence-validation.csv", index = True)
def repRT(row):
    ## if the tweet is a retweet, substitute the original (retweeted) text
    if not pd.isnull(row['rt-text']):
        return row['rt-text']
    else:
        return row['text']
def determineShot(dt, lag = 0, vlag = 0):
    ## shift back an hour (plus any lag) to align tweet time with the shot timeline
    dt = dt - datetime.timedelta(0, 3600 + vlag + lag)
    shot = tldf[(tldf['Start'] <= dt) & (tldf['End'] >= dt)]
    if len(shot) > 0:
        return shot.index.values[0]
    else:
        return None
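## Minimal usage sketch (hypothetical timestamp; assumes tldf is built as
## below): a debate-3 tweet created at 02:05:00 GMT, looked up with the
## two-minute lag, lands at 01:03:00 on the shot timeline:
#   determineShot(datetime.datetime(2012, 10, 23, 2, 5, 0), lag = 120)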
## debate number
debate = 'usprez3'
###############################################################################
##### reaction data
###############################################################################
## debate dates:
## 1 - 2012-10-04 (04 in GMT)
## VP - 2012-10-11 (12 in GMT)
## 2 - 2012-10-16 (17 in GMT)
## 3 - 2012-10-22 (23 in GMT)
date = ''
if debate == 'usprez1':
    sfile = "../data/Debate1-biobehavioral.csv"
    date = '04'
elif debate == 'usprez3':
    sfile = "../data/Debate3-biobehavioral.csv"
    date = '23'
shotdf = pd.read_csv(sfile, index_col = 2)
# Dataframe for matching up shots
tldf = pd.DataFrame({
    'Start': shotdf['Start-Stop'].apply(lambda x: datetime.datetime.strptime('2012-10-' + date + " " + x.split('-')[0], "%Y-%m-%d %H:%M:%S")),
    'End': shotdf['Start-Stop'].apply(lambda x: datetime.datetime.strptime('2012-10-' + date + " " + x.split('-')[1], "%Y-%m-%d %H:%M:%S"))
})
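## tldf now has one row per camera shot (same index as shotdf), with the
## 'Start-Stop' strings expanded into full datetimes on the debate date;
## determineShot() above shifts tweet timestamps back an hour (plus lag)
## to line them up with this timeline.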
###############################################################################
##### sentiment classifier
###############################################################################
def benchmark(clf, X, y, feature_names = []):
    pred = clf.predict(X)
    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
        if len(feature_names) > 0:
            print("top 20 keywords per class:")
            nCat = len(categories)
            nCoef = clf.coef_.shape[0]
            if nCat > 2:
                if nCoef == nCat:
                    for i, category in enumerate(categories):
                        top20 = np.argsort(clf.coef_[i])[-20:]
                        print("%s: %s" % (category, "; ".join(map(lambda x: x.encode("utf-8"), feature_names[top20]))))
            else:
                ## binary case: a single coefficient row; report the positive class
                category = categories[1]
                top20 = np.argsort(clf.coef_[0])[-20:]
                print("%s: %s" % (category, "; ".join(map(lambda x: x.encode("utf-8"), feature_names[top20]))))
    print("classification report:")
    if len(feature_names) > 0:
        print(metrics.classification_report(y, pred, labels = labels, target_names = categories))
    else:
        print(metrics.classification_report(y, pred))
    print("confusion matrix:")
    print(metrics.confusion_matrix(y, pred))
    return pred
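## NB: benchmark() leans on module-level state: `categories` and `labels`
## (defined below) plus whatever vocabulary array is passed as feature_names.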
tdf = pd.read_csv('../data/' + debate + '-valence-validation-coderall.csv')
tdf['y'] = tdf.sentiment.apply(lambda x: int(x))
tdf = tdf[tdf.y != 0]
## replace words associated with a particular candidate
## with a generic token ("lname"/"fname")
tdf.text = tdf.text.apply(lambda x: re.sub(r"obama|romney(ryan)?", "lname", x, flags = re.I))
tdf.text = tdf.text.apply(lambda x: re.sub(r"mitt|barack", "fname", x, flags = re.I))
## draw a random 80% training sample (class balance is handled by StratifiedKFold below)
train_idx = random.sample(tdf.index, int(tdf.shape[0]*0.8))
train_df = tdf.ix[train_idx]
## reset index for CV
train_df = train_df.reset_index()
cv = StratifiedKFold(train_df.y, n_folds = 3)
## save for last test
final_df = tdf.drop(train_idx)
labels = [-1, 1]
categories = ['negative', 'positive']
results = {'f1':[], 'p': [], 'r': [], 'c': [], 'cn': [], 'pen': []}
for train, test in cv:
    vectorizer = TfidfVectorizer(sublinear_tf = True, max_df = 0.5, ngram_range = (1, 2), stop_words = 'english')
    X_train = vectorizer.fit_transform(list(train_df.ix[train].text))
    X_test = vectorizer.transform(list(train_df.ix[test].text))
    features = np.asarray(vectorizer.get_feature_names())
    y_train = train_df.ix[train].y
    y_test = train_df.ix[test].y
    clf = LogisticRegression(class_weight = "auto", C = 0.01)
    clf.fit(X_train, y_train)
    pred = benchmark(clf, X_test, y_test, features)
## best C for this dataset; refit on the final fold's training split before
## scoring the held-out final_df (the final fold's vectorizer is reused here)
clf = LogisticRegression(class_weight = "auto", C = 10**-2)
clf.fit(X_train, y_train)
X_final = vectorizer.transform(list(final_df.text))
pred = benchmark(clf, X_final, final_df.y, features)
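## The `vectorizer` and `clf` fitted in this final step are reused further
## down to score every tweet in the gardenhose stream.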
###############################################################################
##### gardenhose
###############################################################################
## all debates ran 9-10:30 PM EST
## http://www.huffingtonpost.com/2012/10/02/presidential-debate-schedule-2012_n_1931082.html
if debate == 'usprez1':
    gh_cols = ["id_str", "created_at", "text", "user-id_str", "user-name", "user-screen_name", "user-userlevel",
               "rt-id_str", "rt-created_at", "rt-text", "rt-user-id_str", "rt-user-name", "rt-user-screen_name", "rt-user-userlevel"]
    df = pd.read_csv("/project/hanna/elex2012/debates/gh.20121003-usprez.csv",
                     sep = "\t", quoting = csv.QUOTE_NONE, index_col = False, names = gh_cols, parse_dates = [1])
    startDt = datetime.datetime(2012, 10, 4, 1, 0, 0)
    endDt = datetime.datetime(2012, 10, 4, 2, 35, 0)
    lag = 90
elif debate == 'usprez3':
    gh_cols = ["id_str", "created_at", "text", "user-id_str", "user-name", "user-screen_name",
               "rt-id_str", "rt-created_at", "rt-text", "rt-user-id_str", "rt-user-name", "rt-user-screen_name"]
    df = pd.read_csv("/project/hanna/elex2012/debates/gh.20121022-usprez3.csv",
                     sep = "\t", quoting = csv.QUOTE_NONE, index_col = False, names = gh_cols, parse_dates = [1],
                     na_values = [r"\N"], keep_default_na = True, error_bad_lines = False)
    startDt = datetime.datetime(2012, 10, 23, 1, 0, 0)
    endDt = datetime.datetime(2012, 10, 23, 2, 35, 0)
    lag = 120
## filter to debate, sort
df = df.loc[(df['created_at'] >= startDt) & (df['created_at'] <= endDt)]
df = df.sort('created_at')
## pull retweeted text into the main text column for convenience
df['text'] = df.apply(repRT, axis = 1)
## lowercase
df['text'] = df['text'].apply(str.lower)
############
##### Memes
############
## Debate 3 memes
# O: "The 1980's are now calling to ask for their foreign policy back...":
# Start: 00:09:12
# FT: 00:11:10
#df['1980'] = df['text'].apply(lambda x: 1 if '1980' in x else 0)
# R: "Attacking me is not an agenda..."
# Start: 00:11:15
# FT: 01:12:53
#df['attack'] = df['text'].apply(lambda x: 1 if 'attacking me' in x else 0)
# O: "Well, governor, we also have fewer horses and bayonets, because the nature of our military's changed..."
# Start: 00:42:19
# FT: 00:44:04
#df['hnb'] = df['text'].apply(lambda x: 1 if 'horses and bayonets' in x else 0)
# Romney's "I love teachers...":
# Start: 01:26:24
# FT: 01:29:00
#df['teach'] = df['text'].apply(lambda x: 1 if 'i love teachers' in x else 0)
## I'm thinking a two minute lag will be about right for debate 3
###############
##### end memes
###############
## Index tweets that mention only Obama or Romney
df['obama'] = df['text'].apply(lambda x: 1 if 'obama' in x and 'romney' not in x else 0)
df['romney'] = df['text'].apply(lambda x: 1 if 'obama' not in x and 'romney' in x else 0)
## then replace candidate mentions with lname/fname
## NOTE: only do this in the classifying step
df['text'] = df['text'].apply(lambda x: re.sub(r"obama|romney(ryan)?", "lname", x, flags = re.I))
df['text'] = df['text'].apply(lambda x: re.sub(r"mitt|barack", "fname", x, flags = re.I))
## vectorize text and produce sentiment vector
df['score'] = clf.predict(vectorizer.transform(df['text']))
## bin each tweet by shot
df['Shot'] = df['created_at'].apply(determineShot, args = [0, lag])
#df['Shot15'] = df['created_at'].apply(determineShot, args = [15, lag])
#df['Shot30'] = df['created_at'].apply(determineShot, args = [30, lag])
#df['Shot45'] = df['created_at'].apply(determineShot, args = [45, lag])
## assign candidate scores
df['O_score'] = df.apply(lambda x: x['score'] if x['obama'] else None, axis = 1)
df['R_score'] = df.apply(lambda x: x['score'] if x['romney'] else None, axis = 1)
############
## Volume and sentiment. No need to generate these for the
## regression analysis dataset
############
##### volume by minute
df['date'] = np.array(df['created_at'], dtype="datetime64[m]")
grouped = df.groupby('date')
ovol = grouped['obama'].agg([np.sum])
rvol = grouped['romney'].agg([np.sum])
ovol['person'] = 'Obama'
rvol['person'] = 'Romney'
out = ovol.append(rvol)
out.to_csv("../data/gh.%s-volume.csv" % debate)
##### sentiment by minute
osent = grouped['O_score'].agg([np.mean, np.std])
rsent = grouped['R_score'].agg([np.mean, np.std])
osent['person'] = 'Obama'
rsent['person'] = 'Romney'
out = osent.append(rsent)
out.to_csv("../data/gh.%s-sentiment.csv" % debate)
## Group by different shots with lags and volume
for s in ['Shot']: #, 'Shot15', 'Shot30', 'Shot45']:
    grouped = df.groupby(s)
    oscore = grouped['O_score'].agg([np.mean])
    oscore.columns = ['GH_Osentiment_' + s]
    ovol = grouped['obama'].agg([np.sum])
    ovol.columns = ['GH_Ovolume_' + s]
    rscore = grouped['R_score'].agg([np.mean])
    rscore.columns = ['GH_Rsentiment_' + s]
    rvol = grouped['romney'].agg([np.sum])
    rvol.columns = ['GH_Rvolume_' + s]
    ## join them all together
    shotdf = shotdf.merge(oscore, left_index = True, right_index = True)
    shotdf = shotdf.merge(ovol, left_index = True, right_index = True)
    shotdf = shotdf.merge(rscore, left_index = True, right_index = True)
    shotdf = shotdf.merge(rvol, left_index = True, right_index = True)
shotdf.to_csv("../data/" + debate + "-biobehavioral-twitterstats.csv")
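## Files written by this script (paths relative to the working directory):
##   ../data/gh.<debate>-volume.csv    -- per-minute tweet volume per candidate
##   ../data/gh.<debate>-sentiment.csv -- per-minute mean/std of predicted sentiment
##   ../data/<debate>-biobehavioral-twitterstats.csv -- per-shot Twitter stats merged into the shot data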