Skip to content

Instantly share code, notes, and snippets.

@sente
Created March 18, 2015 01:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save sente/b6ce27b1ae5f5ac1b969 to your computer and use it in GitHub Desktop.
Save sente/b6ce27b1ae5f5ac1b969 to your computer and use it in GitHub Desktop.
sli.py
import math
import nltk
import numpy as np
import pandas as pd
import scipy
from nltk.corpus import stopwords
from scipy import linalg
# Rebinds the name imported from nltk.corpus to the plain list of English
# stopwords; from here on `stopwords` is that list.
stopwords = stopwords.words('english')

# Tab-separated input read as two columns, 'cat' and 'text'.
df = pd.read_csv('Reuters/r52-train-stemmed.txt', sep='\t', names=['cat', 'text'], index_col=False)
df.head()  # bare expression: only displays something in an interactive session

# Restrict the working set to the first 2000 'text' entries.
docs = df['text'][0:2000]
print(len(docs))
docs.head()  # bare expression: only displays something in an interactive session
def find_frequencies(docs, tokenize=None):
    """Count token frequencies per document and across the whole collection.

    Args:
        docs: A collection exposing ``len()`` and ``.items()`` that yields
            ``(label, text)`` pairs, e.g. a pandas Series of strings or a
            dict. The label is only used in the progress line.
        tokenize: Optional callable mapping one document to an iterable of
            tokens. Defaults to ``nltk.word_tokenize``.

    Returns:
        A tuple ``(term_indices, corpus_bag, doc_bags)`` where

        * ``term_indices`` maps each token to a consecutive integer index,
          assigned in order of first appearance (starting at 0);
        * ``corpus_bag`` maps each token to its total count over all docs;
        * ``doc_bags`` is a list with one ``{token: count}`` dict per
          document, in iteration order.

    Side effects:
        Prints one progress line per document.
    """
    if tokenize is None:
        # Resolved lazily so a custom tokenizer can be used without NLTK.
        tokenize = nltk.word_tokenize

    term_indices = {}
    corpus_bag = {}
    doc_bags = []
    numdocs = len(docs)

    # .items() is available on both dicts and pandas Series (Series.iteritems
    # was removed in pandas 2.0).
    for label, doc in docs.items():
        # %s rather than %d so a non-numeric label cannot break the progress
        # output; integer labels render exactly as before.
        print('%5s / %5s' % (label, numdocs))

        doc_bag = {}
        # Stopwords are not filtered here; pass a `tokenize` callable that
        # drops them if that is wanted.
        for word in tokenize(doc):
            if word not in term_indices:
                # The next free index equals the number of terms seen so far.
                term_indices[word] = len(term_indices)
            corpus_bag[word] = corpus_bag.get(word, 0) + 1
            doc_bag[word] = doc_bag.get(word, 0) + 1
        doc_bags.append(doc_bag)

    return term_indices, corpus_bag, doc_bags
# Build the vocabulary index plus the corpus-wide and per-document counts.
term_indices, corpus_bag, doc_bags = find_frequencies(docs)

# Spot checks on the result. The 'cocoa' lookups raise KeyError if that token
# never occurred in `docs`.
print(len(term_indices))
print(term_indices['cocoa'])
print(len(corpus_bag))
print(corpus_bag['cocoa'])
print(len(doc_bags[0]))
print(doc_bags[0])
## Uses the corpus_bag and doc_bags found above to compute the global weighting (idf) term
def compute_global_weight(corpus_bag, doc_bags):
    """Compute the entropy-based global weight of every term.

    For a term with corpus-wide count ``gf`` and count ``tf_j`` in document
    ``j``, with ``n`` documents in total::

        p_j    = tf_j / gf
        weight = 1 + sum_j( p_j * log2(p_j) / log2(n) )

    The sum runs over the documents that contain the term.

    Args:
        corpus_bag: Dict mapping each term to its total count over all
            documents.
        doc_bags: List of per-document ``{term: count}`` dicts.

    Returns:
        Dict mapping every term of ``corpus_bag`` to its global weight. A
        term that occurs in no document keeps the initial weight ``1``.
        With fewer than two documents ``log2(n)`` is zero or undefined, so
        every term is given weight ``1`` instead of raising.
    """
    # Every term starts at 1; each containing document then lowers it.
    global_weights = {term: 1 for term in corpus_bag}

    num_docs = len(doc_bags)
    if num_docs < 2:
        # log2(1) == 0 would divide by zero and log2(0) is undefined.
        return global_weights

    logn = math.log(num_docs, 2)

    # Visit only the (document, term) pairs that actually occur instead of
    # probing every document for every term. Documents are still processed
    # in order, so each term accumulates its contributions in the same
    # order as a term-by-term scan would.
    for doc_bag in doc_bags:
        for term, local_count in doc_bag.items():
            if term not in corpus_bag:
                # Only terms known to corpus_bag receive a weight.
                continue
            pij = float(local_count) / corpus_bag[term]
            global_weights[term] += pij * math.log(pij, 2) / logn

    return global_weights
# Compute the global weight of every term from the counts gathered above.
global_weights = compute_global_weight(corpus_bag, doc_bags)
# Bare lookup: the value is discarded unless run in an interactive session;
# raises KeyError if 'cocoa' is not a key of global_weights.
global_weights['cocoa']
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment