Skip to content

Instantly share code, notes, and snippets.

@thiagomarzagao
Created February 11, 2016 17:41
Show Gist options
  • Save thiagomarzagao/68ee390967893c9b6de9 to your computer and use it in GitHub Desktop.
Save thiagomarzagao/68ee390967893c9b6de9 to your computer and use it in GitHub Desktop.
Wordscores in Python
### WORDSCORES (LBG-2003)
### author: Thiago Marzagao
### contact: marzagao ddott 1 at osu ddott edu
import os
import numpy as np
import pandas as pd
# input folder: holds one CSV file per case (reference and virgin alike)
ipath = '/Users/username/inputdata/'
# output folder: the word scores and the virgin scores are written here
opath = '/Users/username/outputdata/'

# reference cases and their known scores (the values below are placeholders);
# the result is a one-row frame: index 'score', one column per reference case
Ar = pd.Series(
    {
        'referenceCase1': 1.1,
        'referenceCase2': 3.7,
        'referenceCase3': 8.2,
    },
    name = 'score',
).to_frame().T
# create function to load and merge data
def loadData(caseSet, path, cols):
    '''
    iterable, string, dict -> pandas.DataFrame

    Read one headerless CSV file per case and outer-merge them on 'word'.

    caseSet : iterable of case names or file names; '.csv' is appended to
              every entry that does not already end with it
    path    : folder holding the CSV files (trailing separator optional)
    cols    : dict {column position: dtype} selecting the two columns to
              read; the selected columns are named 'word' and <case name>

    Returns a frame with a 'word' column plus one column per case, named
    after the file without its '.csv' extension. A word missing from a
    case gets 0 in that case's column.
    '''
    extension = '.csv'
    output = pd.DataFrame(columns = ['word'])
    for case in caseSet:

        # accept both case names and file names; test the suffix only, so a
        # name that merely contains '.csv' somewhere is still completed
        if not case.endswith(extension):
            case = case + extension

        # strip only the trailing extension to get the column name
        caseName = case[:-len(extension)]

        # load new data file (os.path.join works with or without a
        # trailing separator on path)
        newData = pd.read_csv(os.path.join(path, case),
                              usecols = list(cols),
                              dtype = cols,
                              names = ['word', caseName],
                              header = None)

        # merge with previous data, keeping every word seen so far
        output = pd.merge(output, newData, on = 'word', how = 'outer')

    # words absent from a case come out of the outer merge as NaN
    return output.fillna(0)
# load reference data: column 0 (the word) and column 2 (a float, used
# below as the frequency f_wr) of each reference case's CSV file
Fwr = loadData(Ar.keys(), ipath, {0: 'S30', 2: 'float'})

# compute p(r|w) = f_wr / sum(f_wr)_{for all r}
# The row sums are taken over the frequency columns only: 'word' holds text,
# and recent pandas releases raise on a row-wise sum over mixed columns
# instead of silently skipping the non-numeric one.
Pwr = Fwr.iloc[:, 1:].div(Fwr.iloc[:, 1:].sum(axis = 1), axis = 0)

# compute Sw = sum(p(r|w) * A_r)_{for all r} and save to file
Sw = pd.DataFrame(Fwr.word)
Sw['score'] = Pwr.dot(Ar.T)
Sw.to_csv(opath + 'wordscores.csv', index = False)
# load virgin data: every CSV file in ipath that is not a reference case.
# os.listdir returns all directory entries, so anything that is not a
# '.csv' file (hidden files such as .DS_Store, sub-folders, ...) is skipped;
# sorting makes the order of the cases reproducible across runs.
virginSet = sorted(fileName for fileName in os.listdir(ipath)
                   if fileName.endswith('.csv')
                   and fileName.replace('.csv', '') not in Ar.keys())

# column 1 is read as an integer count, column 2 as a float frequency
virginAbsFreq = loadData(virginSet, ipath, {0: 'S30', 1: 'int'})
Fwv = loadData(virginSet, ipath, {0: 'S30', 2: 'float'})
# inner join of Fwv with Sw on 'word': only words that occur in the virgin
# data AND have a word score survive
matched = pd.merge(Fwv, Sw, how = 'inner', on = 'word')

# word scores of the surviving words (single column 'score')
cleanSw = matched[['score']]

# frequencies of the surviving words, one column per virgin case
cleanFwv = matched.drop(['word', 'score'], axis = 1)

# Sv = sum(Fwv * Sw)_{for all w}  -> one row per virgin case
Sv = cleanFwv.T.dot(cleanSw)

# transformed Sv: deviations from the mean virgin score are rescaled by
# (std of reference scores / std of virgin scores), then re-centred
Sv_t = Sv.sub(Sv.mean()).mul(Ar.T.std() / Sv.std()).add(Sv.mean())

# Vv = sum(Fwv * (Sw - Sv)^2)_{for all w}
# (words x 1) minus (1 x cases) broadcasts to a (words x cases) array
deviation = cleanSw.values - Sv.T.values
Vv = cleanFwv.mul(np.square(deviation)).sum(axis = 0)
# inner join of the absolute frequencies with Sw on 'word', so that only
# scored words are counted
scoredCounts = pd.merge(virginAbsFreq, Sw, how = 'inner', on = 'word')

# N = number of scored words in each virgin case
N = scoredCounts.drop(['word', 'score'], axis = 1).sum(axis = 0)

# standard errors and 95% confidence intervals on the raw scores
std_error = np.sqrt(Vv / N)
lower = Sv.values.flatten() - (1.96 * std_error).values
upper = Sv.values.flatten() + (1.96 * std_error).values

# apply to the interval bounds the same rescaling used for the scores
centre = Sv.mean().values
rescale = (Ar.T.std() / Sv.std()).values
lower_t = (lower - centre) * rescale + centre
upper_t = (upper - centre) * rescale + centre
# print everything
# (single-argument print(...) behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a syntax error on Python 3)
print('')
print('Original scores (w/ 95CI):')
print('')
# attach the interval bounds to the raw scores
Sv['lower'] = lower
Sv['upper'] = upper
print(Sv)
print('')
print('Transformed scores (w/ 95CI):')
print('')
# attach the rescaled interval bounds to the transformed scores
Sv_t['lower'] = lower_t
Sv_t['upper'] = upper_t
print(Sv_t)
print('')

# save transformed estimates to file (the row labels are written under
# the header 'case')
Sv_t.to_csv(opath + 'virginScores.csv', index_label = 'case')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment