Skip to content

Instantly share code, notes, and snippets.

@fannix
Created December 8, 2011 14:39
Show Gist options
  • Save fannix/1447153 to your computer and use it in GitHub Desktop.
Save fannix/1447153 to your computer and use it in GitHub Desktop.
nltk and sklearn
#!/usr/bin/env python2
import random
import nltk
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import movie_reviews
# Build one (text, category) pair per corpus file, walking the corpus a
# category at a time, where text is whatever movie_reviews.raw() returns
# for that file id.
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.raw(fileid), category))

# Shuffle in place so the train/test split made later by position is not
# grouped by category.
random.shuffle(documents)
def document_features(document):
    """Return the features used to represent one document.

    This is currently an identity hook: the document is handed back
    untouched, so callers receive exactly the object they passed in.

    Args:
        document: The document to extract features from.

    Returns:
        The same ``document`` object, unmodified.
    """
    return document
# Pair each document's features with a boolean label: True when the
# category is "pos", False otherwise.
featuresets = [(document_features(d), c == "pos") for (d, c) in documents]
featuresets_X = [d for (d, c) in featuresets]
featuresets_Y = [c for (d, c) in featuresets]

# The first 100 (already shuffled) documents are held out for testing;
# everything after them is used for training.
train_set_X, test_set_X = featuresets_X[100:], featuresets_X[:100]
train_set_Y, test_set_Y = featuresets_Y[100:], featuresets_Y[:100]

# Fit the vocabulary on the training documents only, then reuse that same
# vocabulary to vectorize the held-out documents.
vec = CountVectorizer()
train_mat = vec.fit_transform(train_set_X)
test_mat = vec.transform(test_set_X)

# penalty='l1' needs a solver that supports it; 'liblinear' does, while the
# default solver of current scikit-learn releases rejects an L1 penalty.
lr = LogisticRegression(C=0.1, penalty='l1', solver='liblinear')

# The sparse count matrices are passed straight to the estimator: densifying
# them first costs a lot of memory and yields an np.matrix, which recent
# scikit-learn releases refuse as input.
lr.fit(train_mat, train_set_Y)
res = lr.predict(test_mat)

# Fraction of held-out documents whose predicted label matches the true one.
print(np.mean(res == np.asarray(test_set_Y)))

from operator import itemgetter
coef = lr.coef_[0]
# (column index, coefficient) pairs ordered from smallest to largest
# coefficient; sorted() is stable, so ties keep ascending column order.
coef_ind = sorted(enumerate(coef), key=itemgetter(1))
# vocabulary_ is the fitted term -> column index mapping (the plain
# `vocabulary` attribute is only the constructor argument). Ordering the
# items by column index makes voc[i] the (term, index) entry of column i.
voc = sorted(vec.vocabulary_.items(), key=itemgetter(1))
k = 20
# The k entries with the largest coefficients, in ascending coefficient order.
topk = [voc[i] for i, _ in coef_ind[-k:]]
print(topk)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment