Skip to content

Instantly share code, notes, and snippets.

Created May 30, 2014 04:51
Show Gist options
  • Save thiagomarzagao/406be950a4fb67af3bde to your computer and use it in GitHub Desktop.
Save thiagomarzagao/406be950a4fb67af3bde to your computer and use it in GitHub Desktop.
Code used for my "Automated Democracy Scores" paper.
#!/usr/bin/env python
import os
import time
import pickle
import numpy as np
import pandas as pd
# set paths
# Every path is built by plain string concatenation, so the directory values
# keep their trailing '/' and the functions below append file names directly.
basepath = '/fs/lustre/osu6994/hdf5/'
relfreq_rows = basepath + 'relfreq_rows/'  # '.h5' stores read by compute_Sw
relfreq_cols = basepath + 'relfreq_cols/'  # '.h5' stores read by get_freq (Fwv)
absfreq_cols = basepath + 'absfreq_cols/'  # '.h5' stores read by get_freq (virgin_absfreq)
udsfile = basepath + 'uds.csv'  # reference scores; create_Ar reads columns 0, 1 and 3
polityfile = basepath + 'polity.csv'  # reference scores; create_Ar reads columns 0, 1 and 2
wordsfile = basepath + 'words'  # pickle file loaded by load_words
batches = basepath + 'batches/'  # SvCI_batch_<n>.csv files written by save_SvCI
output = basepath + 'output/'  # wordscores.csv, SvCI.csv and Sv_t.csv are written here
# set reference years
# create_Ar keeps reference rows from these years; get_freq drops columns whose
# name ends in one of these years.
refyears = [1992]
# create Ar
def create_Ar(reffile, refyears):
    """Build the table of reference scores (Ar).

    Reads three columns from ``reffile``, keeps rows whose ``year`` is
    greater than 1991 and listed in ``refyears``, and keys each score by the
    first column concatenated with the year (as a string).

    Parameters:
        reffile: either ``udsfile`` or ``polityfile``; selects which CSV
            columns hold the score.
        refyears: collection of years to keep.

    Returns:
        DataFrame indexed by ``'doc'`` with a single ``'docscore'`` column.

    Raises:
        ValueError: if ``reffile`` is neither ``udsfile`` nor ``polityfile``.
    """
    if reffile == udsfile:
        usecols = [0, 1, 3]
    elif reffile == polityfile:
        usecols = [0, 1, 2]
    else:
        # Fail with a clear message instead of an unbound-variable error.
        raise ValueError('unknown reference file: %r' % (reffile,))
    full = pd.read_csv(reffile, usecols = usecols)
    ref = full[full['year'] > 1991]
    # A dict is kept on purpose: if a (name, year) key repeats, the last
    # score wins, exactly as in a row-by-row build.
    scores = {
        name + str(year): score
        for name, year, score in zip(ref.iloc[:, 0], ref.iloc[:, 1], ref.iloc[:, 2])
        if year in refyears
    }
    # list(...) so the constructor receives a concrete sequence of pairs
    # rather than a dict view.
    Ar = pd.DataFrame(list(scores.items()), columns = ['doc', 'docscore'])
    Ar.set_index('doc', inplace = True)
    return Ar
# compute Sw (and save to file)
def compute_Sw(Ar, relfreq_rows):
    """Compute word scores (Sw) from the reference texts and save them.

    Every file in ``relfreq_rows`` whose name contains ``'.h5'`` is opened
    as an HDF store and its ``'freq'`` table is read. Columns other than
    ``'word'`` that are not in ``Ar.index`` are discarded, each row is
    divided by its row sum (Pwr), rows containing NaN are dropped, and the
    result is combined with the reference scores into one ``'wordscore'``
    column. The stacked scores are written to ``output + 'wordscores.csv'``.

    Parameters:
        Ar: reference scores indexed by document, one score column.
        relfreq_rows: directory path (with trailing separator) of the stores.

    Returns:
        DataFrame with a single ``'wordscore'`` column (empty if no store
        was found).
    """
    chunks = []
    for file in os.listdir(relfreq_rows):
        if '.h5' not in file:
            continue
        # Context manager so each store is closed once its table is loaded.
        with pd.HDFStore(relfreq_rows + file, mode = 'r') as store:
            Fwr = store['freq']
        keep = [col for col in Fwr.columns if col == 'word' or col in Ar.index]
        Fwr = Fwr[keep]
        sumFwr = Fwr.sum(axis = 1)
        Pwr = Fwr.div(sumFwr, axis = 0).dropna()
        # NOTE(review): the right-hand side of this assignment was missing in
        # the source. It is reconstructed as the Wordscores word score,
        # Sw = sum over reference texts r of P(r|w) * A_r -- confirm against
        # the original script. Ar is restricted to the columns present in
        # this chunk so the labels line up for the dot product.
        Sw_r = Pwr.dot(Ar.loc[Pwr.columns])
        Sw_r.columns = ['wordscore']
        chunks.append(Sw_r)
    # Concatenate once at the end; fall back to an empty frame when the
    # directory holds no stores.
    Sw = pd.concat(chunks) if chunks else pd.DataFrame()
    Sw.to_csv(output + 'wordscores.csv', index = True, index_label = 'word')
    return Sw
# load column of words
def load_words(wordsfile):
    """Load and return the pickled object stored in ``wordsfile``.

    Parameters:
        wordsfile: path of a pickle file.

    Returns:
        Whatever object the file contains (used by get_freq as the 'word'
        column).
    """
    # NOTE: pickle.load executes arbitrary code embedded in the file; only
    # use this on files you produced yourself.
    with open(wordsfile, mode = 'rb') as f:
        return pickle.load(f)
# get frequencies
def get_freq(path, file, Ar, words):
    """Load a frequency table and keep only the non-reference cases.

    Reads the ``'freq'`` table from the HDF store ``path + file``, drops
    every column whose name ends in a year listed in the module-level
    ``refyears``, and indexes the rows by ``words``.

    Parameters:
        path: directory path (with trailing separator).
        file: store file name inside ``path``.
        Ar: unused; kept so existing callers keep working.
        words: values for the ``'word'`` index; must have one entry per row.

    Returns:
        DataFrame indexed by ``'word'``.

    Raises:
        ValueError: if the last four characters of a column name are not an
            integer.
    """
    # Context manager so the store is closed once the table is loaded.
    with pd.HDFStore(path + file, mode = 'r') as store:
        freq = store['freq']
    ref_cases = [case for case in freq.columns if int(case[-4:]) in refyears]
    freq = freq.drop(ref_cases, axis = 1)
    freq['word'] = words
    freq.set_index('word', inplace = True)
    return freq
# compute Sv
def compute_Sv(Fwv, Sw):
    """Score each document (Sv) from its word frequencies and the word scores.

    ``Fwv`` and ``Sw`` are inner-joined on their indexes, so only words
    present in both contribute.

    Parameters:
        Fwv: word-by-document frequencies, indexed by word.
        Sw: word scores indexed by word, with a ``'wordscore'`` column.

    Returns:
        Tuple ``(tempjoin1, Sv)``: the joined frame (document columns
        followed by ``'wordscore'``) and a DataFrame indexed by document
        with a single ``'docscore'`` column.
    """
    tempjoin1 = pd.merge(Fwv, Sw, how = 'inner', left_index = True, right_index = True, sort = False)
    # Every column but the last ('wordscore', contributed by Sw) is a document.
    k = len(tempjoin1.columns) - 1
    # NOTE(review): this expression was garbled in the source (unbalanced
    # parenthesis, no product between frequencies and scores). It is
    # reconstructed as the Wordscores text score,
    # Sv = sum over words w of F_wv * S_w -- confirm against the original.
    Sv = tempjoin1.iloc[:, :k].T.dot(tempjoin1['wordscore']).to_frame()
    Sv.columns = ['docscore']
    return tempjoin1, Sv
# compute Vv
def compute_Vv(tempjoin1, Sv):
    """Compute, per document, the frequency-weighted squared deviation of
    word scores from the document score: ``sum_w F_wv * (S_w - S_v) ** 2``.

    Parameters:
        tempjoin1: joined frame from compute_Sv (document columns plus a
            ``'wordscore'`` column). It is not modified.
        Sv: document scores, one row per document column of ``tempjoin1``
            and in the same order.

    Returns:
        Series indexed by document.
    """
    # drop() returns a new frame, so the caller's tempjoin1 keeps its
    # 'wordscore' column.
    cleanFwv = tempjoin1.drop('wordscore', axis = 1)
    # (words x 1) minus (1 x documents) broadcasts to words x documents.
    deviations = np.asarray(tempjoin1[['wordscore']]) - np.asarray(Sv).T
    Vv = (cleanFwv * np.square(deviations)).sum(axis = 0)
    return Vv
# compute confidence intervals
def compute_CI(virgin_absfreq, Sw, Vv, Sv):
    """Compute lower and upper bounds at 1.96 standard errors around Sv.

    The standard error of each document is ``sqrt(Vv / N)``, where ``N`` is
    the document's total absolute frequency over the words that have a
    score in ``Sw``.

    Parameters:
        virgin_absfreq: word-by-document absolute frequencies, indexed by word.
        Sw: word scores indexed by word, with a ``'wordscore'`` column.
        Vv: Series of per-document variances, indexed by document.
        Sv: document scores, indexed by document.

    Returns:
        Tuple ``(lower, upper)`` of NumPy arrays in the row order of ``Sv``.
    """
    tempjoin2 = pd.merge(virgin_absfreq, Sw, how = 'inner', left_index = True, right_index = True, sort = False)
    N = tempjoin2.drop('wordscore', axis = 1).sum(axis = 0)
    # Vv / N aligns by label and may come back in a different order than Sv
    # when the absolute-frequency columns are ordered differently; reindex so
    # the positional arithmetic below pairs each score with its own error.
    std_error = np.sqrt(Vv / N).reindex(Sv.index)
    margin = 1.96 * np.asarray(std_error)
    scores = np.asarray(Sv).flatten()
    lower = scores - margin
    upper = scores + margin
    return lower, upper
# stack Sv and CIs and save batch to disk
def save_SvCI(Sv, lower, upper, counter):
    """Write one batch of document scores with their bounds to disk.

    The batch is saved as ``batches + 'SvCI_batch_<counter>.csv'`` without a
    header row; each line holds the index, the score, the lower bound and
    the upper bound.

    Parameters:
        Sv: document scores, indexed by document. It is not modified.
        lower: lower bounds, one per row of ``Sv``.
        upper: upper bounds, one per row of ``Sv``.
        counter: batch number used in the file name.
    """
    # Work on a copy so the caller's Sv does not gain extra columns.
    SvCI = Sv.copy()
    SvCI['lower'] = lower
    SvCI['upper'] = upper
    SvCI.to_csv(batches + 'SvCI_batch_' + str(counter) + '.csv', header = False)
# compute SvCI batches and save to disk
def compute_SvCI(reffile, refyears, wordsfile, relfreq_rows, relfreq_cols, absfreq_cols):
    """Score every batch of documents and save scores plus bounds per batch.

    Builds the reference scores and word scores once, then walks the
    relative- and absolute-frequency stores pairwise; each pair produces one
    batch file via save_SvCI, numbered from 1.

    Parameters:
        reffile: reference-score CSV passed to create_Ar.
        refyears: reference years passed to create_Ar.
        wordsfile: pickle file passed to load_words.
        relfreq_rows: directory of stores used by compute_Sw.
        relfreq_cols: directory of relative-frequency stores ('.h5').
        absfreq_cols: directory of absolute-frequency stores ('.h5').
    """
    Ar = create_Ar(reffile, refyears)
    words = load_words(wordsfile)
    Sw = compute_Sw(Ar, relfreq_rows)
    # os.listdir returns names in arbitrary order; sort both listings so the
    # pairing is deterministic.
    # NOTE(review): this assumes the two directories use parallel file
    # names -- confirm.
    relfiles = sorted(file for file in os.listdir(relfreq_cols) if '.h5' in file)
    absfiles = sorted(file for file in os.listdir(absfreq_cols) if '.h5' in file)
    for counter, (relfile, absfile) in enumerate(zip(relfiles, absfiles), 1):
        Fwv = get_freq(relfreq_cols, relfile, Ar, words)
        tempjoin1, Sv = compute_Sv(Fwv, Sw)
        Vv = compute_Vv(tempjoin1, Sv)
        virgin_absfreq = get_freq(absfreq_cols, absfile, Ar, words)
        lower, upper = compute_CI(virgin_absfreq, Sw, Vv, Sv)
        save_SvCI(Sv, lower, upper, counter)
# consolidate all SvCI batches into one file
def consolidate_SvCI(output):
    """Concatenate every batch file into ``output + 'SvCI.csv'``.

    Files in the ``batches`` directory whose name contains
    ``'SvCI_batch_'`` are appended in sorted name order. Any existing
    ``SvCI.csv`` is overwritten.

    Parameters:
        output: directory path (with trailing separator) for ``SvCI.csv``.
    """
    batch_files = sorted(file for file in os.listdir(batches) if 'SvCI_batch_' in file)
    # NOTE(review): the source read each batch but the statements that write
    # it out were missing; appending the batch text verbatim is the evident
    # intent -- confirm against the original script.
    with open(output + 'SvCI.csv', mode = 'w') as fullSv:
        for file in batch_files:
            with open(batches + file, mode = 'r') as batch:
                fullSv.write(batch.read())
# compute transformed estimates
def compute_Svt(output, reffile, refyears):
    """Rescale document scores and their bounds, then save them.

    Scores, lower bounds and upper bounds read from ``output + 'SvCI.csv'``
    are all transformed as ``(x - mean(Sv)) * (sd(Ar) / sd(Sv)) + mean(Sv)``
    and written to ``output + 'Sv_t.csv'``.

    Parameters:
        output: directory path (with trailing separator) holding ``SvCI.csv``.
        reffile: reference-score CSV passed to create_Ar.
        refyears: reference years passed to create_Ar.

    Returns:
        DataFrame indexed by document with ``'docscore'``, ``'lower'`` and
        ``'upper'`` columns.
    """
    Ar = create_Ar(reffile, refyears)
    # The batch files are written with header = False, so the file has no
    # header row; header = None keeps the first case as data instead of
    # consuming it as column names. One read replaces three passes.
    SvCI = pd.read_csv(output + 'SvCI.csv', header = None, usecols = [0, 1, 2, 3], index_col = 0)
    SvCI.index.name = 'doc'
    SvCI.columns = ['docscore', 'lower', 'upper']
    Sv_mean = SvCI['docscore'].mean()
    # Ratio of the reference-score spread to the raw-score spread.
    scale = Ar.iloc[:, 0].std() / SvCI['docscore'].std()
    Sv_t = (SvCI - Sv_mean) * scale + Sv_mean
    Sv_t.to_csv(output + 'Sv_t.csv')
    return Sv_t
# Run the pipeline: score the batches, merge them into SvCI.csv, then rescale.
compute_SvCI(udsfile, refyears, wordsfile, relfreq_rows, relfreq_cols, absfreq_cols)
# compute_Svt reads output + 'SvCI.csv', which only consolidate_SvCI creates.
consolidate_SvCI(output)
compute_Svt(output, udsfile, refyears)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment