The Python script below implements the ‘wordscores’ algorithm (Laver, M., Benoit, K., & Garry, J. (2003). Extracting policy positions from political texts using words as data. American Political Science Review, 97(2), 311-331). It takes word-frequency matrices as input. These matrices must be in CSV format, with no header row: the first column contains the words, the second column each word's absolute frequency, and the third column each word's relative frequency (the layout the loadData function below expects).
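For illustration, one input file might look like this (hypothetical words and made-up frequencies):

taxes,14,0.0023
schools,9,0.0015
borders,21,0.0034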
### WORDSCORES (LBG-2003)
### author: Thiago Marzagao
### contact: marzagao ddott 1 at osu ddott edu
import os
import numpy as np
import pandas as pd
ipath = '/Users/username/inputdata/' # folder containing the CSV files
opath = '/Users/username/outputdata/' # folder where output will be saved
# hardcode your reference cases and their scores
Ar = pd.DataFrame({'referenceCase1': 1.1, # these are just examples
                   'referenceCase2': 3.7,
                   'referenceCase3': 8.2},
                  index = ['score'])
# create function to load and merge data
def loadData(caseSet, path, cols):
    '''
    iterable, string, dict -> pandas.DataFrame
    '''
    output = pd.DataFrame(columns = ['word'])
    for case in caseSet:
        # check if case is casename or filename
        if '.csv' not in case:
            case = case + '.csv'
        # load new data file
        newData = pd.read_csv(path + case,
                              usecols = [col for col in cols.keys()],
                              dtype = cols,
                              names = ['word', case.replace('.csv', '')],
                              header = None)
        # merge with previous data
        output = pd.merge(output, newData, on = 'word', how = 'outer')
    output = output.fillna(0) # words absent from a case get frequency 0
    return output
# load reference data
Fwr = loadData(Ar.keys(), ipath, {0: 'S30', 2: 'float'})
# compute p(r|w) = f_wr / sum(f_wr)_{for all r}
Pwr = Fwr.iloc[:, 1:].div(Fwr.sum(axis = 1), axis = 0)
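# each row of Pwr gives p(r|w) for one word; the word's score is its
# expected reference score: Sw = sum(p(r|w) * Ar)_{for all r}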
# compute Sw and save to file
Sw = pd.DataFrame(Fwr.word)
Sw['score'] = Pwr.dot(Ar.T)
Sw.to_csv(opath + 'wordscores.csv', index = False)
# load virgin data
virginSet = [fname for fname in os.listdir(ipath)
             if fname.endswith('.csv') # consider only CSV files
             and fname.replace('.csv', '') not in Ar.keys()]
virginAbsFreq = loadData(virginSet, ipath, {0: 'S30', 1: 'int'})
Fwv = loadData(virginSet, ipath, {0: 'S30', 2: 'float'})
# 1:1 merge Fwv with Sw (to discard all disjoint words)
temp = pd.merge(Fwv, Sw, on = 'word', how = 'inner')
# split filtered Sw
cleanSw = pd.DataFrame(temp.score)
# clean up filtered Fwv
del temp['word']
del temp['score']
cleanFwv = temp
# compute Sv = sum(Fwv * Sw)_{for all w}
Sv = cleanFwv.T.dot(cleanSw)
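# raw virgin scores cluster tightly around the mean, since virgin texts
# share most of their words; LBG therefore rescale them to the dispersion
# of the reference scores:
# Sv_t = (Sv - mean(Sv)) * (std(Ar) / std(Sv)) + mean(Sv)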
# compute transformed Sv
Sv_t = (Sv - Sv.mean()) * (Ar.T.std() / Sv.std()) + Sv.mean()
# compute Vv
Vv = (cleanFwv * np.square((np.array(cleanSw)
                            - np.array(Sv.T)))).sum(axis = 0)
# 1:1 merge absolute frequencies with Sw (to discard all disjoint words)
temp = pd.merge(virginAbsFreq, Sw, on = 'word', how = 'inner')
# compute N
del temp['word']
del temp['score']
N = temp.sum(axis = 0)
# compute standard errors and confidence intervals
std_error = np.sqrt(Vv / N)
lower = np.array(Sv).flatten() - np.array((1.96 * std_error))
upper = np.array(Sv).flatten() + np.array((1.96 * std_error))
# compute transformed confidence intervals
lower_t = (np.array(lower) - np.array(Sv.mean())) \
          * np.array((Ar.T.std() / Sv.std())) \
          + np.array(Sv.mean())
upper_t = (np.array(upper) - np.array(Sv.mean())) \
          * np.array((Ar.T.std() / Sv.std())) \
          + np.array(Sv.mean())
# print everything
print ''
print 'Original scores (w/ 95% CI):'
print ''
Sv['lower'] = lower
Sv['upper'] = upper
print Sv
print ''
print 'Transformed scores (w/ 95% CI):'
print ''
Sv_t['lower'] = lower_t
Sv_t['upper'] = upper_t
print Sv_t
print ''
# save transformed estimates to file
Sv_t.to_csv(opath + 'virginScores.csv', index_label = 'case')
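# to run (a sketch; assumes the script is saved under a filename of your
# choice, e.g. wordscores.py, after editing ipath, opath, and Ar above):
#   $ python wordscores.py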