# boblannon/residual_IDF.py

## residual_IDF.py
from collections import defaultdict
from math import log
from math import exp
import pandas as pd

# this is based on data in the form released here: http://corpora.uni-leipzig.de/

# inv_w.txt is a table of (word_id, sentence_id, offset), which lets us create an inverted
# index with offset information

# Inverted index built from inv_w.txt: word_id -> sentence_id -> [offsets].
# All three fields are converted to int.
inv_w = defaultdict(lambda: defaultdict(list))
# A context manager closes the file handle as soon as loading is done.
with open('inv_w.txt') as inv_file:
    for line in inv_file:
        line = line.strip()
        if not line:
            continue  # tolerate blank lines instead of failing on the unpack
        wid, sid, offset = line.split('\t')
        inv_w[int(wid)][int(sid)].append(int(offset))

# words.txt is a table of (word_id, word, corpus_frequency), which allows us to make a master
# dictionary of corpus-wide counts

# Master vocabulary built from words.txt: word -> {'wid': ..., 'cf': ...}.
# Both values are stored as the raw strings read from the file (no int
# conversion happens here).
words = {}
# A context manager closes the file handle as soon as loading is done.
with open('words.txt') as words_file:
    for line in words_file:
        line = line.strip()
        if not line:
            continue  # tolerate blank lines instead of failing on the unpack
        wid, word, cf = line.split('\t')
        words[word] = {'wid': wid, 'cf': cf}


# Reverse mapping: word id (string, as stored in `words`) -> word.
# .items() is used because it exists on both Python 2 and Python 3.
id_lookup = {v['wid']: k for k, v in words.items()}

# N - total number of documents in a corpus
# cf - corpus frequency, or the number of times the word occurs across the corpus
# df - document frequency, or the number of documents that contain at least one occurrence of the word
# IDF - inverse document frequency

def smooth_cf(cf):
    """Return the corpus frequency ``cf`` converted to a float.

    Placeholder hook for smoothing: no smoothing is applied yet, the value
    is only coerced with ``float``.
    """
    # TODO: write a smoothing fct
    as_float = float(cf)
    return as_float

def smooth_df(df):
    """Return the document frequency ``df`` converted to a float.

    Placeholder hook for smoothing: no smoothing is applied yet, the value
    is only coerced with ``float``.
    """
    # TODO: write a smoothing fct
    as_float = float(df)
    return as_float

def poisson(cf,N):
    """Return log2(1 - e^(-cf/N)).

    ``cf`` is first passed through ``smooth_cf``; ``N`` is converted to float.
    With rate = cf/N this is the base-2 log of one minus the Poisson
    probability of zero occurrences.

    Raises ValueError (math domain error) when the rate is 0, because the
    argument of the logarithm is then 0, and ZeroDivisionError when N is 0.
    """
    # Local import: the file header only imports log and exp from math.
    from math import expm1

    rate = smooth_cf(cf) / float(N)  # may want to include some smoothing, here
    # 1 - exp(-rate) written as -expm1(-rate): for tiny rates (rare words in a
    # large corpus) the direct subtraction cancels most significant digits.
    return log(-expm1(-rate), 2)

def IDF(df,N):
    """Return the inverse document frequency, log2(N / df).

    ``df`` is passed through ``smooth_df`` and ``N`` is converted to float
    before dividing.
    """
    return log(float(N) / smooth_df(df), 2)

def RIDF(s, N=300000):
    """Return the residual IDF of word ``s``: IDF(df, N) + poisson(cf, N).

    Parameters:
        s: a word present as a key in ``words``.
        N: total number of documents in the corpus (defaults to the
           previously hard-coded 300000).

    Returns 0 when the word's corpus frequency or document frequency is 0,
    since neither logarithm is defined in that case.

    Raises KeyError if ``s`` is not in ``words``.
    """
    w = words[s]
    # words.txt fields are stored as strings, while inv_w is keyed by int word
    # ids; convert so the zero check and the index lookup actually match.
    cf = int(w['cf'])
    wid = int(w['wid'])
    # .get() avoids inserting an empty entry into the defaultdict for word ids
    # that have no postings.
    df = len(inv_w.get(wid, ()))
    if cf == 0 or df == 0:
        return 0
    return IDF(df, N) + poisson(cf, N)

# One (word id, word, RIDF score) tuple per vocabulary entry.
# .items() is used because it exists on both Python 2 and Python 3.
ridf_records = [(entry['wid'], word, RIDF(word)) for word, entry in words.items()]


# Tabulate the records computed above (the name `rows` is not defined anywhere
# in this script). Columns are positional: 0 = word id, 1 = word, 2 = RIDF.
df = pd.DataFrame(ridf_records)
	from collections import defaultdict
	from math import log
	from math import exp
	import pandas as pd

	# this is based on data in the form released here: http://corpora.uni-leipzig.de/

	# inv_w.txt is a table of (word_id, sentence_id, offset), which lets us create an inverted
	# index with offset information

	# Inverted index built from inv_w.txt: word_id -> sentence_id -> [offsets].
	# All three fields are converted to int.
	inv_w = defaultdict(lambda: defaultdict(list))
	# A context manager closes the file handle as soon as loading is done.
	with open('inv_w.txt') as inv_file:
		for line in inv_file:
			line = line.strip()
			if not line:
				continue  # tolerate blank lines instead of failing on the unpack
			wid, sid, offset = line.split('\t')
			inv_w[int(wid)][int(sid)].append(int(offset))

	# words.txt is a table of (word_id, word, corpus_frequency), which allows us to make a master
	# dictionary of corpus-wide counts

	# Master vocabulary built from words.txt: word -> {'wid': ..., 'cf': ...}.
	# Both values are stored as the raw strings read from the file (no int
	# conversion happens here).
	words = {}
	# A context manager closes the file handle as soon as loading is done.
	with open('words.txt') as words_file:
		for line in words_file:
			line = line.strip()
			if not line:
				continue  # tolerate blank lines instead of failing on the unpack
			wid, word, cf = line.split('\t')
			words[word] = {'wid': wid, 'cf': cf}


	# Reverse mapping: word id (string, as stored in `words`) -> word.
	# .items() is used because it exists on both Python 2 and Python 3.
	id_lookup = {v['wid']: k for k, v in words.items()}

	# N - total number of documents in a corpus
	# cf - corpus frequency, or the number of times the word occurs across the corpus
	# df - document frequency, or the number of documents that contain at least one occurrence of the word
	# IDF - inverse document frequency

	def smooth_cf(cf):
		"""Return the corpus frequency ``cf`` converted to a float.

		Placeholder hook for smoothing: no smoothing is applied yet, the value
		is only coerced with ``float``.
		"""
		# TODO: write a smoothing fct
		return float(cf)

	def smooth_df(df):
		"""Return the document frequency ``df`` converted to a float.

		Placeholder hook for smoothing: no smoothing is applied yet, the value
		is only coerced with ``float``.
		"""
		# TODO: write a smoothing fct
		return float(df)

	def poisson(cf,N):
		"""Return log2(1 - e^(-cf/N)).

		``cf`` is first passed through ``smooth_cf``; ``N`` is converted to float.
		With rate = cf/N this is the base-2 log of one minus the Poisson
		probability of zero occurrences.

		Raises ValueError (math domain error) when the rate is 0, because the
		argument of the logarithm is then 0, and ZeroDivisionError when N is 0.
		"""
		# Local import: the file header only imports log and exp from math.
		from math import expm1

		rate = smooth_cf(cf) / float(N)  # may want to include some smoothing, here
		# 1 - exp(-rate) written as -expm1(-rate): for tiny rates the direct
		# subtraction cancels most significant digits.
		return log(-expm1(-rate), 2)

	def IDF(df,N):
		"""Return the inverse document frequency, log2(N / df).

		``df`` is passed through ``smooth_df`` and ``N`` is converted to float
		before dividing.
		"""
		idf = float(N)/smooth_df(df)
		return log(idf,2)

	def RIDF(s, N=300000):
		"""Return the residual IDF of word ``s``: IDF(df, N) + poisson(cf, N).

		Parameters:
			s: a word present as a key in ``words``.
			N: total number of documents in the corpus (defaults to the
			   previously hard-coded 300000).

		Returns 0 when the word's corpus frequency or document frequency is 0,
		since neither logarithm is defined in that case.

		Raises KeyError if ``s`` is not in ``words``.
		"""
		w = words[s]
		# words.txt fields are stored as strings, while inv_w is keyed by int
		# word ids; convert so the zero check and the index lookup match.
		cf = int(w['cf'])
		wid = int(w['wid'])
		# .get() avoids inserting an empty entry into the defaultdict for word
		# ids that have no postings.
		df = len(inv_w.get(wid, ()))
		if cf == 0 or df == 0:
			return 0
		return IDF(df, N) + poisson(cf, N)

	# One (word id, word, RIDF score) tuple per vocabulary entry.
	# .items() is used because it exists on both Python 2 and Python 3.
	ridf_records = [(entry['wid'], word, RIDF(word)) for word, entry in words.items()]


	# Tabulate the records computed above (the name `rows` is not defined
	# anywhere in this script). Columns are positional: 0 = word id, 1 = word,
	# 2 = RIDF.
	df = pd.DataFrame(ridf_records)