Skip to content

Instantly share code, notes, and snippets.

@thiagomarzagao
Last active May 4, 2020 01:50
Show Gist options
  • Star 6 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thiagomarzagao/5851207 to your computer and use it in GitHub Desktop.
Save thiagomarzagao/5851207 to your computer and use it in GitHub Desktop.
The Python script below implements the “Fightin’ Words” algorithm (see Monroe, B., Colaresi, M., Quinn, K. Fightin’ words: lexical feature selection and evaluation for identifying the content of political conflict. Political Analysis, 16(4), pp. 372-403). It takes as inputs word-frequency matrices. These matrices must be in CSV format. The first…
### FIGHTIN' WORDS (MCQ-2008)
### author: Thiago Marzagao
### contact: marzagao ddott 1 at osu ddott edu
import os
import sys
import pandas as pd
import numpy as np
from numpy import matrix as m
rpath = '/Users/username/datafolder/' # folder of word-frequency matrices

PRIOR_FILE = 'corpus.csv'       # global word frequencies (informative prior)
OUTPUT_FILE = 'zScores_ij.csv'  # written into rpath by this script


def is_frequency_file(file_name):
    """Return True if *file_name* is a per-document word-frequency CSV.

    The prior file and this script's own output live in the same folder and
    must not be read back as documents (otherwise a second run would try to
    parse the previous z-score table as word counts).
    """
    return (file_name.endswith('.csv')
            and file_name not in (PRIOR_FILE, OUTPUT_FILE))


def column_pairs(n_docs):
    """Return every (i, j) with i < j < n_docs, ordered by i and then j."""
    return [(i, j) for i in range(n_docs) for j in range(i + 1, n_docs)]


def pair_names(doc_names):
    """Return the 'docI_docJ' labels for every pair of documents (i < j).

    The order matches column_pairs(), i.e. the column order of the pairwise
    z-score matrix returned by fightin_words_zscores().
    """
    return [doc_names[i] + '_' + doc_names[j]
            for i, j in column_pairs(len(doc_names))]


def fightin_words_zscores(counts, alpha):
    """Compute the z-scores of the smoothed log-odds for every word.

    Parameters
    ----------
    counts : array-like, shape (words, documents)
        Frequency of each word in each document.
    alpha : array-like, length words
        Prior pseudo-count of each word (the same vector is used for every
        document).

    Returns
    -------
    z_i : ndarray, shape (words, documents)
        Each document's log-odds minus the log-odds of the pooled corpus,
        divided by its estimated standard deviation.
    z_ij : ndarray, shape (words, documents * (documents - 1) / 2)
        Log-odds of document i minus document j for every pair i < j (see
        column_pairs() for the column order), divided by its estimated
        standard deviation.
    """
    counts = np.asarray(counts, dtype=float)
    alpha = np.asarray(alpha, dtype=float).reshape(-1, 1)
    alpha_total = alpha.sum()

    # per-document log-odds: log((y_i + a) / ((n_i + a0) - (y_i + a)))
    smoothed = counts + alpha
    doc_totals = counts.sum(axis=0, keepdims=True) + alpha_total
    log_odds = np.log(smoothed / (doc_totals - smoothed))

    # the same quantity for all documents pooled together
    pooled = counts.sum(axis=1, keepdims=True) + alpha
    pooled_total = counts.sum() + alpha_total
    pooled_log_odds = np.log(pooled / (pooled_total - pooled))

    # approximate variances are sums of reciprocal smoothed counts
    inverse = 1.0 / smoothed
    z_i = (log_odds - pooled_log_odds) / np.sqrt(inverse + 1.0 / pooled)

    # explicit integer dtype so that "no pairs" still indexes cleanly
    pairs = column_pairs(counts.shape[1])
    left = np.array([i for i, _ in pairs], dtype=int)
    right = np.array([j for _, j in pairs], dtype=int)
    z_ij = ((log_odds[:, left] - log_odds[:, right])
            / np.sqrt(inverse[:, left] + inverse[:, right]))
    return z_i, z_ij


def load_frequencies(folder, file_names):
    """Read the word-frequency CSVs and outer-join them on the word.

    Each file must have the word in its first column and the count in its
    second column, with no header row. Returns (table, doc_names): a
    DataFrame with a 'word' column followed by one count column per
    document (0 where a word is absent from a document), and the cleaned
    document names in column order.
    """
    doc_names = []
    table = pd.DataFrame(columns=['word'])
    for file_name in file_names:
        # clean up document name
        document = file_name.replace('.csv', '').replace('-', '')
        doc_names.append(document)
        # Only an empty field counts as missing: with the default NA list,
        # literal words such as "null" or "nan" would be read as NaN.
        freqs = pd.read_csv(os.path.join(folder, file_name),
                            usecols=[0, 1],
                            names=['word', document],
                            dtype={'word': str, document: int},
                            header=None,
                            keep_default_na=False,
                            na_values=[''])
        table = pd.merge(table, freqs, on='word', how='outer')
    return table.fillna(0), doc_names


def read_prior_choice():
    """Ask which prior to use and return the answer as an int.

    Exits with 'Invalid choice' if the answer is not an integer.
    """
    try:
        ask = raw_input  # Python 2, where input() would eval() the answer
    except NameError:
        ask = input      # Python 3
    try:
        return int(ask('Uninformative (1) or informative (2) prior? '))
    except ValueError:
        sys.exit('Invalid choice')


def prior_alphas(table, choice, folder):
    """Return (table, alpha) for the chosen prior.

    choice 1: every word gets a pseudo-count of 0.01.
    choice 2: pseudo-counts are the global frequencies in PRIOR_FILE; words
    missing from that file get the smallest frequency found in it. The
    returned table is the left join with the prior file (minus the prior
    column), so its rows line up with alpha.
    Any other choice exits with 'Invalid choice'.
    """
    if choice == 1:
        return table, np.full((len(table), 1), 0.01)
    if choice == 2:
        priors = pd.read_csv(os.path.join(folder, PRIOR_FILE),
                             usecols=[0, 1],
                             names=['word', 'gfreq'],
                             dtype={'word': str},
                             header=None,
                             keep_default_na=False,
                             na_values=[''])
        merged = pd.merge(table, priors, on='word', how='left')
        merged = merged.fillna(merged['gfreq'].min())
        alpha = np.asarray(merged['gfreq'], dtype=float).reshape(-1, 1)
        del merged['gfreq']
        return merged, alpha
    sys.exit('Invalid choice')


def save_zscores(words, z_ij, column_names, path):
    """Write the pairwise z-scores to *path* as CSV.

    The first row holds 'word' followed by the pair labels; every other row
    holds a word followed by its z-scores.
    """
    out = pd.DataFrame(z_ij, columns=column_names)
    out.insert(0, 'word', list(words))
    out.to_csv(path, index=False)


def main():
    """Run the whole pipeline on the folder named by rpath."""
    # Sorted so that the column order (and therefore the sign of every
    # i - j contrast) does not depend on the directory listing order.
    file_names = sorted(name for name in os.listdir(rpath)
                        if is_frequency_file(name))
    # quit if no word-frequency matrices found
    if not file_names:
        sys.exit('No word-frequency matrices in {}'.format(rpath))

    table, doc_names = load_frequencies(rpath, file_names)

    print('')
    table, alpha = prior_alphas(table, read_prior_choice(), rpath)

    # only the pairwise scores are saved; z_i is computed but not written
    _, z_ij = fightin_words_zscores(table.iloc[:, 1:], alpha)
    save_zscores(table['word'], z_ij, pair_names(doc_names),
                 os.path.join(rpath, OUTPUT_FILE))

    # wrap up
    print('')
    print('Done! All files successfully processed')
    print('Output saved to {}'.format(rpath))
    print('')


if __name__ == '__main__':
    main()
@jseabold
Copy link

Curious if you ever implemented the Laplace prior version?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment