Skip to content

Instantly share code, notes, and snippets.

@boblannon
Last active December 17, 2015 05:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save boblannon/5557702 to your computer and use it in GitHub Desktop.
Save boblannon/5557702 to your computer and use it in GitHub Desktop.
Quick and dirty method of using residual IDF to find keywords in a corpus. Implementation of Church and Gale 1991.
from collections import defaultdict
from math import log
from math import exp
import pandas as pd
# this is based on data in the form released here: http://corpora.uni-leipzig.de/
# inv_w.txt is a table of (word_id, sentence_id, offset), which lets us create an inverted
# index with offset information
# Inverted index: word_id -> sentence_id -> list of offsets in that sentence.
inv_w = defaultdict(lambda: defaultdict(list))
# Use a context manager so the file handle is closed deterministically.
with open('inv_w.txt') as inv_file:
    for line in inv_file:
        # Skip blank lines (e.g. a trailing newline at end of file), which
        # would otherwise break the three-way unpack below.
        if not line.strip():
            continue
        wid, sid, offset = line.strip().split('\t')
        inv_w[int(wid)][int(sid)].append(int(offset))
# words.txt is a table of (word_id, word, corpus_frequency), which allows us to make a master
# dictionary of corpus-wide counts
# Master dictionary: word -> {'wid': integer word id, 'cf': integer corpus frequency}.
words = {}
# Use a context manager so the file handle is closed deterministically.
with open('words.txt') as words_file:
    for line in words_file:
        # Skip blank lines, which would break the three-way unpack below.
        if not line.strip():
            continue
        wid, word, cf = line.strip().split('\t')
        # Store the id and the count as ints: inv_w is keyed by integer word
        # ids, and RIDF compares cf against the integer 0.
        words[word] = {'wid': int(wid), 'cf': int(cf)}
# Reverse mapping from word id back to the word itself.
# .items() works on both Python 2 and Python 3 (iteritems is Python 2 only).
id_lookup = {v['wid']: k for k, v in words.items()}
# N - total number of documents in a corpus
# cf - corpus frequency, or the number of times the word occurs across the corpus
# df - document frequency, or the number of documents that contain at least one occurrence of the word
# IDF - inverse document frequency
def smooth_cf(cf):
    """Return the corpus frequency *cf* converted to a float.

    This is a placeholder hook: no smoothing is applied yet, the value is
    only coerced with ``float``.
    """
    # TODO: write a smoothing function
    return float(cf)
def smooth_df(df):
    """Return the document frequency *df* converted to a float.

    This is a placeholder hook: no smoothing is applied yet, the value is
    only coerced with ``float``.
    """
    # TODO: write a smoothing function
    return float(df)
def poisson(cf, N):
    """Return log2(1 - exp(-cf / N)).

    With rate ``cf / N``, ``exp(-cf / N)`` is the Poisson probability of a
    document containing zero occurrences, so the value returned is the log
    (base 2) of the probability of at least one occurrence. The result is
    therefore never positive.

    cf -- corpus frequency of the word (passed through ``smooth_cf``)
    N  -- total number of documents in the corpus

    Raises ValueError (math domain error) when the smoothed cf is 0, since
    that makes the argument of the logarithm 0.
    """
    # Probability of seeing the word zero times in a document.
    p = exp(-(smooth_cf(cf) / float(N)))  # may want to include some smoothing, here
    return log(1 - p, 2)
def IDF(df, N):
    """Return the inverse document frequency log2(N / df).

    df -- number of documents containing the word (passed through
          ``smooth_df``)
    N  -- total number of documents in the corpus

    Raises ZeroDivisionError when the smoothed df is 0.
    """
    ratio = float(N) / smooth_df(df)
    return log(ratio, 2)
def RIDF(s, N=300000):
    """Return the residual IDF of the word *s*.

    The score is ``IDF(df, N) + poisson(cf, N)``: the observed inverse
    document frequency plus the (non-positive) log-probability, under a
    Poisson model, that a document contains the word at least once.

    s -- the word to score; must be a key of ``words``
    N -- total number of documents in the corpus (defaults to 300000, the
         value previously hard-coded here)

    Returns 0 when the word has no recorded occurrences (cf == 0) or does
    not appear in the inverted index (df == 0), since neither term is
    defined in those cases.

    Raises KeyError if *s* is not in ``words``.
    """
    w = words[s]
    # Coerce to int: the values may have been read from file as strings,
    # while inv_w is keyed by integer word ids.
    cf = int(w['cf'])
    wid = int(w['wid'])
    # .get() avoids inserting an empty entry into the defaultdict for
    # words that are missing from the index.
    df = len(inv_w.get(wid, ()))
    if cf == 0 or df == 0:
        return 0
    return IDF(df, N) + poisson(cf, N)
# One (word_id, word, residual IDF) record per word in the corpus.
# .items() works on both Python 2 and Python 3 (iteritems is Python 2 only).
ridf_records = [(v['wid'], k, RIDF(k)) for k, v in words.items()]
# Build the frame from the records computed above (the name `rows` was
# never defined and raised NameError).
df = pd.DataFrame(ridf_records)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment