sli.py
import math
import nltk
import numpy as np
import pandas as pd
import scipy
from nltk.corpus import stopwords
from scipy import linalg

stopwords = set(stopwords.words('english'))

## Load the stemmed Reuters R52 training set: one tab-separated
## (category, text) pair per line.
df = pd.read_csv('Reuters/r52-train-stemmed.txt', sep='\t', names=['cat', 'text'], index_col=False)
df.head()

## Work with the first 2000 documents.
docs = df['text'][0:2000]
print(len(docs))
docs.head()
def find_frequencies(docs):
    term_indices = {}  ## term -> integer index, assigned in order of first appearance
    currentIndex = 0   ## counter used to hand out the term indices in order
    corpus_bag = {}    ## term -> total count across the whole corpus
    doc_bags = []      ## one {term: count} dictionary per document
    numdocs = len(docs)
    for i, doc in docs.items():
        print('%5d / %5d' % (i, numdocs))
        doc_bag = {}  ## term frequencies for the document currently being examined
        ## Tokenize the document with nltk
        doc_tokens = nltk.word_tokenize(doc)
        for word in doc_tokens:
            ## Optionally throw out stopwords:
            ## if word in stopwords:
            ##     continue
            if word not in term_indices:
                ## New word: give it the index value currentIndex, increment
                ## currentIndex, and start its corpus and document counts at 1
                term_indices[word] = currentIndex
                currentIndex += 1
                corpus_bag[word] = 1
                doc_bag[word] = 1
            else:
                ## Known word: increment the corpus count
                corpus_bag[word] += 1
                ## If the word is already in the doc_bag, increment that
                ## counter, else set it to 1
                if word in doc_bag:
                    doc_bag[word] += 1
                else:
                    doc_bag[word] = 1
        doc_bags.append(doc_bag)
    return term_indices, corpus_bag, doc_bags
term_indices, corpus_bag, doc_bags = find_frequencies(docs)

## Spot-check the three structures:
print(len(term_indices))
print(term_indices['cocoa'])
print(len(corpus_bag))
print(corpus_bag['cocoa'])
print(len(doc_bags[0]))
print(doc_bags[0])
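## A quick sanity check of find_frequencies on a tiny hand-made corpus (a
## hypothetical two-document example, not part of the Reuters data), to make
## the three returned structures concrete:
toy_docs = pd.Series(['the cat sat', 'the dog sat down'])
toy_indices, toy_corpus_bag, toy_doc_bags = find_frequencies(toy_docs)
assert toy_indices == {'the': 0, 'cat': 1, 'sat': 2, 'dog': 3, 'down': 4}
assert toy_corpus_bag == {'the': 2, 'cat': 1, 'sat': 2, 'dog': 1, 'down': 1}
assert toy_doc_bags[1] == {'the': 1, 'dog': 1, 'sat': 1, 'down': 1}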
## Use the corpus_bag and doc_bags found above to compute the global weighting
## (idf-like) term for each word, using entropy weighting:
##   g_i = 1 + sum_j (p_ij * log2(p_ij)) / log2(n),  with  p_ij = tf_ij / gf_i
## where tf_ij is the count of term i in document j, gf_i is its total count
## across the corpus, and n is the number of documents. A term concentrated in
## a single document keeps weight 1 (p = 1, log2(1) = 0), while a term spread
## evenly across all n documents gets weight 0.
def compute_global_weight(corpus_bag, doc_bags):
    global_weights = {}  ## term -> global weight (the idf component)
    ## log base 2 of the number of documents in the set
    logn = math.log(len(doc_bags), 2)
    for i, term in enumerate(corpus_bag):
        print(i, len(corpus_bag))  ## progress indicator
        ## Start the global weight at 1
        global_weight = 1
        ## The term's total count across the corpus, from corpus_bag
        global_count = corpus_bag[term]
        ## For each document containing the term, compute p_ij and decrease
        ## the global weight by p_ij * log2(p_ij) / log2(n)
        for doc_bag in doc_bags:
            if term in doc_bag:
                pij = doc_bag[term] / global_count
                global_weight += pij * math.log(pij, 2) / logn
        ## Record this term's global weight
        global_weights[term] = global_weight
    return global_weights
global_weights = compute_global_weight(corpus_bag, doc_bags)
print(global_weights['cocoa'])
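## The unused numpy/scipy.linalg imports above suggest the usual next LSI
## step: build the log-entropy weighted term-document matrix and take its
## SVD. The gist stops at the global weights, so this is only a minimal
## sketch of that step (an assumption, not part of the original), using the
## standard log-entropy entry a_ij = g_i * log2(tf_ij + 1). Note the dense
## matrix can be large for big vocabularies.
A = np.zeros((len(term_indices), len(doc_bags)))
for j, doc_bag in enumerate(doc_bags):
    for term, count in doc_bag.items():
        A[term_indices[term], j] = global_weights[term] * math.log(count + 1, 2)
U, s, Vt = linalg.svd(A, full_matrices=False)
## Keeping the top k singular vectors gives the k-dimensional LSI space:
k = 100
docs_k = np.dot(np.diag(s[:k]), Vt[:k, :])  ## documents in the reduced space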