sli.py
import math
import nltk
import numpy as np
import pandas as pd
import scipy
from nltk.corpus import stopwords
from scipy import linalg

stopwords = set(stopwords.words('english'))

## Load the stemmed Reuters R52 training set: one tab-separated
## (category, text) pair per line.
df = pd.read_csv('Reuters/r52-train-stemmed.txt', sep='\t', names=['cat', 'text'], index_col=False)
df.head()

## Work with the first 2000 documents.
docs = df['text'][0:2000]
print(len(docs))
docs.head()
def find_frequencies(docs):
    term_indices = {}  ## term -> integer index, assigned in order of first appearance
    currentIndex = 0   ## counter used to hand out the term indices in order
    corpus_bag = {}    ## term -> total count across the whole corpus
    doc_bags = []      ## one {term: count} dictionary per document
    numdocs = len(docs)
    for i, doc in docs.items():
        print('%5d / %5d' % (i, numdocs))
        doc_bag = {}  ## term frequencies for the document currently being examined
        ## Tokenize the document with nltk
        doc_tokens = nltk.word_tokenize(doc)
        for word in doc_tokens:
            ## Optionally throw out stopwords:
            ## if word in stopwords:
            ##     continue
            if word not in term_indices:
                ## New word: give it the index value currentIndex, increment
                ## currentIndex, and start its corpus and document counts at 1
                term_indices[word] = currentIndex
                currentIndex += 1
                corpus_bag[word] = 1
                doc_bag[word] = 1
            else:
                ## Known word: increment the corpus count
                corpus_bag[word] += 1
                ## If the word is already in the doc_bag, increment that
                ## counter, else set it to 1
                if word in doc_bag:
                    doc_bag[word] += 1
                else:
                    doc_bag[word] = 1
        doc_bags.append(doc_bag)
    return term_indices, corpus_bag, doc_bags
term_indices, corpus_bag, doc_bags = find_frequencies(docs)

## Spot-check the three structures:
print(len(term_indices))
print(term_indices['cocoa'])
print(len(corpus_bag))
print(corpus_bag['cocoa'])
print(len(doc_bags[0]))
print(doc_bags[0])
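## A quick sanity check of find_frequencies on a tiny hand-made corpus (a
## hypothetical two-document example, not part of the Reuters data), to make
## the three returned structures concrete:
toy_docs = pd.Series(['the cat sat', 'the dog sat down'])
toy_indices, toy_corpus_bag, toy_doc_bags = find_frequencies(toy_docs)
assert toy_indices == {'the': 0, 'cat': 1, 'sat': 2, 'dog': 3, 'down': 4}
assert toy_corpus_bag == {'the': 2, 'cat': 1, 'sat': 2, 'dog': 1, 'down': 1}
assert toy_doc_bags[1] == {'the': 1, 'dog': 1, 'sat': 1, 'down': 1}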
## Use the corpus_bag and doc_bags found above to compute the global weighting
## (idf-like) term for each word, using entropy weighting:
##   g_i = 1 + sum_j (p_ij * log2(p_ij)) / log2(n),  with  p_ij = tf_ij / gf_i
## where tf_ij is the count of term i in document j, gf_i is its total count
## across the corpus, and n is the number of documents. A term concentrated in
## a single document keeps weight 1 (p = 1, log2(1) = 0), while a term spread
## evenly across all n documents gets weight 0.
def compute_global_weight(corpus_bag, doc_bags):
    global_weights = {}  ## term -> global weight (the idf component)
    ## log base 2 of the number of documents in the set
    logn = math.log(len(doc_bags), 2)
    for i, term in enumerate(corpus_bag):
        print(i, len(corpus_bag))  ## progress indicator
        ## Start the global weight at 1
        global_weight = 1
        ## The term's total count across the corpus, from corpus_bag
        global_count = corpus_bag[term]
        ## For each document containing the term, compute p_ij and decrease
        ## the global weight by p_ij * log2(p_ij) / log2(n)
        for doc_bag in doc_bags:
            if term in doc_bag:
                pij = doc_bag[term] / global_count
                global_weight += pij * math.log(pij, 2) / logn
        ## Record this term's global weight
        global_weights[term] = global_weight
    return global_weights
global_weights = compute_global_weight(corpus_bag, doc_bags)
print(global_weights['cocoa'])
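## The unused numpy/scipy.linalg imports above suggest the usual next LSI
## step: build the log-entropy weighted term-document matrix and take its
## SVD. The gist stops at the global weights, so this is only a minimal
## sketch of that step (an assumption, not part of the original), using the
## standard log-entropy entry a_ij = g_i * log2(tf_ij + 1). Note the dense
## matrix can be large for big vocabularies.
A = np.zeros((len(term_indices), len(doc_bags)))
for j, doc_bag in enumerate(doc_bags):
    for term, count in doc_bag.items():
        A[term_indices[term], j] = global_weights[term] * math.log(count + 1, 2)
U, s, Vt = linalg.svd(A, full_matrices=False)
## Keeping the top k singular vectors gives the k-dimensional LSI space:
k = 100
docs_k = np.dot(np.diag(s[:k]), Vt[:k, :])  ## documents in the reduced space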