thiagomarzagao/ads.py

## ads.py
#!/usr/bin/env python

import os
import time
import pickle
import numpy as np
import pandas as pd

# set paths
basepath = '/fs/lustre/osu6994/hdf5/'
relfreq_rows = basepath + 'relfreq_rows/'
relfreq_cols = basepath + 'relfreq_cols/'
absfreq_cols = basepath + 'absfreq_cols/'
udsfile = basepath + 'uds.csv'
polityfile = basepath + 'polity.csv'
wordsfile = basepath + 'words'
batches = basepath + 'batches/'
output = basepath + 'output/'

# set reference years
refyears = [1992]

# create Ar
def create_Ar(reffile, refyears):
    if reffile == udsfile:
        full = pd.read_csv(udsfile, usecols = [0, 1, 3])
    elif reffile == polityfile:
        full = pd.read_csv(polityfile, usecols = [0, 1, 2])
    ref = full[full['year'] > 1991]
    Ar = {ref.iat[row, 0] + str(ref.iat[row, 1]): ref.iat[row, 2] for row in range(len(ref)) if ref.iat[row, 1] in refyears}
    Ar = pd.DataFrame(Ar.items(), columns = ['doc', 'docscore'])
    Ar.set_index('doc', inplace = True)
    return Ar

# compute Sw (and save to file)
def compute_Sw(Ar, relfreq_rows):
    Sw = pd.DataFrame()
    for file in [file for file in os.listdir(relfreq_rows) if '.h5' in file]:
        store = pd.HDFStore(relfreq_rows + file)
        Fwr = store['freq']
        for col in Fwr.columns:
            if col != 'word' and col not in Ar.index:
                del Fwr[col]
        sumFwr = Fwr.sum(axis = 1)
        Pwr = Fwr.T / sumFwr
        Pwr = Pwr.T
        Pwr = Pwr.dropna()
        Sw_r = Pwr.dot(Ar)
        Sw_r.columns = ['wordscore']
        Sw = pd.concat([Sw, Sw_r])
        store.close()
    Sw.to_csv(output + 'wordscores.csv', index = True, index_label = 'word')
    return Sw

# load column of words
def load_words(wordsfile):
    f = open(wordsfile, mode = 'rb')
    words = pickle.load(f)
    f.close()
    return words

# get frequencies
def get_freq(path, file, Ar, words):
    store = pd.HDFStore(path + file)
    freq = store['freq']
    ref_index = set(Ar.index)
    for case in freq.columns:
      	if int(case[-4:]) in refyears:
            del freq[case]
    freq['word'] = words
    freq.set_index('word', inplace = True)
    store.close()
    return freq

# compute Sv
def compute_Sv(Fwv, Sw):
    tempjoin1 = pd.merge(Fwv, Sw, how = 'inner', left_index = True, right_index = True, sort = False)
    k = len(tempjoin1.columns) - 1
    Sv = pd.DataFrame(tempjoin1.iloc[:, :k].T.dot(tempjoin1['wordscore']))
    Sv.columns = ['docscore']
    return tempjoin1, Sv

# compute Vv
def compute_Vv(tempjoin1, Sv):
    cleanSw = pd.DataFrame(tempjoin1.wordscore)
    cleanSw.columns = ['score']
    cleanFwv = tempjoin1
    del cleanFwv['wordscore']
    Vv = (cleanFwv * np.square((np.array(cleanSw) - np.array(Sv.T)))).sum(axis = 0)
    return Vv

# compute confidence intervals
def compute_CI(virgin_absfreq, Sw, Vv, Sv):
    tempjoin2 = pd.merge(virgin_absfreq, Sw, how = 'inner', left_index = True, right_index = True, sort = False)
    del tempjoin2['wordscore']
    N = tempjoin2.sum(axis = 0)
    std_error = np.sqrt(Vv / N)
    lower = np.array(Sv).flatten() - np.array((1.96 * std_error))
    upper = np.array(Sv).flatten() + np.array((1.96 * std_error))
    return lower, upper

# stack Sv and CIs and save batch to disk
def save_SvCI(Sv, lower, upper, counter):
    SvCI = Sv
    SvCI['lower'] = lower
    SvCI['upper'] = upper
    SvCI.to_csv(batches + 'SvCI_batch_' + str(counter) + '.csv', header = False)

# compute SvCI batches and save to disk
def compute_SvCI(reffile, refyears, wordsfile, relfreq_rows, relfreq_cols, absfreq_cols):
    Ar = create_Ar(reffile, refyears)
    words = load_words(wordsfile)
    Sw = compute_Sw(Ar, relfreq_rows)
    relfiles = [file for file in os.listdir(relfreq_cols) if '.h5' in file]
    absfiles = [file for file in os.listdir(absfreq_cols) if '.h5' in file]
    counter = 0
    for relfile, absfile in zip(relfiles, absfiles):
        counter += 1
        Fwv = get_freq(relfreq_cols, relfile, Ar, words)
        tempjoin1, Sv = compute_Sv(Fwv, Sw)
        Vv = compute_Vv(tempjoin1, Sv)
        virgin_absfreq = get_freq(absfreq_cols, absfile, Ar, words)
        lower, upper = compute_CI(virgin_absfreq, Sw, Vv, Sv)
        save_SvCI(Sv, lower, upper, counter)

# consolidate all SvCI batches into one file
def consolidate_SvCI(output):
    fullSv = open(output + 'SvCI.csv', mode = 'w')
    fullSv.write(',Sv,lower,upper\n')
    for file in os.listdir(batches):
        if 'SvCI_batch_' in file:
            newSv = open(batches + file, mode = 'r').read()
            fullSv.write(newSv)
    fullSv.close()

# compute transformed estimates
def compute_Svt(output, reffile, refyears):
    Ar = create_Ar(reffile, refyears)
    Sv = pd.read_csv(output + 'SvCI.csv', usecols = [0, 1], index_col = [0])
    lower = pd.read_csv(output + 'SvCI.csv', usecols = [0, 2], index_col = [0])
    upper = pd.read_csv(output + 'SvCI.csv', usecols = [0, 3], index_col = [0])
    Sv_t = (Sv - Sv.mean()) * (Ar.std()[0] / Sv.std()[0]) + Sv.mean()
    lower_t = (np.array(lower) - np.array(Sv.mean())) * np.array((Ar.std()[0] / Sv.std()[0])) + np.array(Sv.mean())
    upper_t = (np.array(upper) - np.array(Sv.mean())) * np.array((Ar.std()[0] / Sv.std()[0])) + np.array(Sv.mean())
    Sv_t['lower'] = lower_t
    Sv_t['upper'] = upper_t
    Sv_t.to_csv(output + 'Sv_t.csv')
    return Sv_t

compute_SvCI(udsfile, refyears, wordsfile, relfreq_rows, relfreq_cols, absfreq_cols)
consolidate_SvCI(output)
compute_Svt(output, udsfile, refyears)
	#!/usr/bin/env python

	import os
	import time
	import pickle
	import numpy as np
	import pandas as pd

	# set paths
	basepath = '/fs/lustre/osu6994/hdf5/'
	relfreq_rows = basepath + 'relfreq_rows/'
	relfreq_cols = basepath + 'relfreq_cols/'
	absfreq_cols = basepath + 'absfreq_cols/'
	udsfile = basepath + 'uds.csv'
	polityfile = basepath + 'polity.csv'
	wordsfile = basepath + 'words'
	batches = basepath + 'batches/'
	output = basepath + 'output/'

	# set reference years
	refyears = [1992]

	# create Ar
	def create_Ar(reffile, refyears):
	if reffile == udsfile:
	full = pd.read_csv(udsfile, usecols = [0, 1, 3])
	elif reffile == polityfile:
	full = pd.read_csv(polityfile, usecols = [0, 1, 2])
	ref = full[full['year'] > 1991]
	Ar = {ref.iat[row, 0] + str(ref.iat[row, 1]): ref.iat[row, 2] for row in range(len(ref)) if ref.iat[row, 1] in refyears}
	Ar = pd.DataFrame(Ar.items(), columns = ['doc', 'docscore'])
	Ar.set_index('doc', inplace = True)
	return Ar

	# compute Sw (and save to file)
	def compute_Sw(Ar, relfreq_rows):
	Sw = pd.DataFrame()
	for file in [file for file in os.listdir(relfreq_rows) if '.h5' in file]:
	store = pd.HDFStore(relfreq_rows + file)
	Fwr = store['freq']
	for col in Fwr.columns:
	if col != 'word' and col not in Ar.index:
	del Fwr[col]
	sumFwr = Fwr.sum(axis = 1)
	Pwr = Fwr.T / sumFwr
	Pwr = Pwr.T
	Pwr = Pwr.dropna()
	Sw_r = Pwr.dot(Ar)
	Sw_r.columns = ['wordscore']
	Sw = pd.concat([Sw, Sw_r])
	store.close()
	Sw.to_csv(output + 'wordscores.csv', index = True, index_label = 'word')
	return Sw

	# load column of words
	def load_words(wordsfile):
	f = open(wordsfile, mode = 'rb')
	words = pickle.load(f)
	f.close()
	return words

	# get frequencies
	def get_freq(path, file, Ar, words):
	store = pd.HDFStore(path + file)
	freq = store['freq']
	ref_index = set(Ar.index)
	for case in freq.columns:
	if int(case[-4:]) in refyears:
	del freq[case]
	freq['word'] = words
	freq.set_index('word', inplace = True)
	store.close()
	return freq

	# compute Sv
	def compute_Sv(Fwv, Sw):
	tempjoin1 = pd.merge(Fwv, Sw, how = 'inner', left_index = True, right_index = True, sort = False)
	k = len(tempjoin1.columns) - 1
	Sv = pd.DataFrame(tempjoin1.iloc[:, :k].T.dot(tempjoin1['wordscore']))
	Sv.columns = ['docscore']
	return tempjoin1, Sv

	# compute Vv
	def compute_Vv(tempjoin1, Sv):
	cleanSw = pd.DataFrame(tempjoin1.wordscore)
	cleanSw.columns = ['score']
	cleanFwv = tempjoin1
	del cleanFwv['wordscore']
	Vv = (cleanFwv * np.square((np.array(cleanSw) - np.array(Sv.T)))).sum(axis = 0)
	return Vv

	# compute confidence intervals
	def compute_CI(virgin_absfreq, Sw, Vv, Sv):
	tempjoin2 = pd.merge(virgin_absfreq, Sw, how = 'inner', left_index = True, right_index = True, sort = False)
	del tempjoin2['wordscore']
	N = tempjoin2.sum(axis = 0)
	std_error = np.sqrt(Vv / N)
	lower = np.array(Sv).flatten() - np.array((1.96 * std_error))
	upper = np.array(Sv).flatten() + np.array((1.96 * std_error))
	return lower, upper

	# stack Sv and CIs and save batch to disk
	def save_SvCI(Sv, lower, upper, counter):
	SvCI = Sv
	SvCI['lower'] = lower
	SvCI['upper'] = upper
	SvCI.to_csv(batches + 'SvCI_batch_' + str(counter) + '.csv', header = False)

	# compute SvCI batches and save to disk
	def compute_SvCI(reffile, refyears, wordsfile, relfreq_rows, relfreq_cols, absfreq_cols):
	Ar = create_Ar(reffile, refyears)
	words = load_words(wordsfile)
	Sw = compute_Sw(Ar, relfreq_rows)
	relfiles = [file for file in os.listdir(relfreq_cols) if '.h5' in file]
	absfiles = [file for file in os.listdir(absfreq_cols) if '.h5' in file]
	counter = 0
	for relfile, absfile in zip(relfiles, absfiles):
	counter += 1
	Fwv = get_freq(relfreq_cols, relfile, Ar, words)
	tempjoin1, Sv = compute_Sv(Fwv, Sw)
	Vv = compute_Vv(tempjoin1, Sv)
	virgin_absfreq = get_freq(absfreq_cols, absfile, Ar, words)
	lower, upper = compute_CI(virgin_absfreq, Sw, Vv, Sv)
	save_SvCI(Sv, lower, upper, counter)

	# consolidate all SvCI batches into one file
	def consolidate_SvCI(output):
	fullSv = open(output + 'SvCI.csv', mode = 'w')
	fullSv.write(',Sv,lower,upper\n')
	for file in os.listdir(batches):
	if 'SvCI_batch_' in file:
	newSv = open(batches + file, mode = 'r').read()
	fullSv.write(newSv)
	fullSv.close()

	# compute transformed estimates
	def compute_Svt(output, reffile, refyears):
	Ar = create_Ar(reffile, refyears)
	Sv = pd.read_csv(output + 'SvCI.csv', usecols = [0, 1], index_col = [0])
	lower = pd.read_csv(output + 'SvCI.csv', usecols = [0, 2], index_col = [0])
	upper = pd.read_csv(output + 'SvCI.csv', usecols = [0, 3], index_col = [0])
	Sv_t = (Sv - Sv.mean()) * (Ar.std()[0] / Sv.std()[0]) + Sv.mean()
	lower_t = (np.array(lower) - np.array(Sv.mean())) * np.array((Ar.std()[0] / Sv.std()[0])) + np.array(Sv.mean())
	upper_t = (np.array(upper) - np.array(Sv.mean())) * np.array((Ar.std()[0] / Sv.std()[0])) + np.array(Sv.mean())
	Sv_t['lower'] = lower_t
	Sv_t['upper'] = upper_t
	Sv_t.to_csv(output + 'Sv_t.csv')
	return Sv_t

	compute_SvCI(udsfile, refyears, wordsfile, relfreq_rows, relfreq_cols, absfreq_cols)
	consolidate_SvCI(output)
	compute_Svt(output, udsfile, refyears)