Created
December 8, 2011 14:39
-
-
Save fannix/1447153 to your computer and use it in GitHub Desktop.
nltk and sklearn
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2 | |
import random | |
import nltk | |
from sklearn.linear_model import LogisticRegression | |
import numpy as np | |
from sklearn.feature_extraction.text import CountVectorizer | |
from nltk.corpus import movie_reviews | |
# Load every movie review as a (raw_text, category) pair; the category is
# whatever label the corpus reader reports for that file.
documents = [
    (movie_reviews.raw(fileid), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]
# The list above is built one category after another; shuffle in place so the
# positional train/test slicing done later does not follow that ordering.
random.shuffle(documents)
def document_features(document):
    """Return the feature representation of *document*.

    This is currently an identity hook: the document is handed back
    unchanged, so any tokenising or counting has to happen downstream
    (the script feeds the result to a ``CountVectorizer``).
    """
    return document
# Pair each document's features with a boolean target: True for "pos".
featuresets = [(document_features(d), c == "pos") for (d, c) in documents]
featuresets_X = [d for (d, c) in featuresets]
featuresets_Y = [c for (d, c) in featuresets]

# The first n_test documents are held out for evaluation; the rest are used
# for training.
n_test = 100
train_set_X, test_set_X = featuresets_X[n_test:], featuresets_X[:n_test]
train_set_Y, test_set_Y = featuresets_Y[n_test:], featuresets_Y[:n_test]

# Fit the vocabulary on the training texts only, then reuse it unchanged for
# the held-out texts.
vec = CountVectorizer()
train_mat = vec.fit_transform(train_set_X)
test_mat = vec.transform(test_set_X)

# The solver is named explicitly because L1 regularisation is not supported
# by every LogisticRegression solver; liblinear handles it.
# NOTE(review): the `solver` keyword does not exist on very old scikit-learn
# releases -- drop it there.
lr = LogisticRegression(C=0.1, penalty='l1', solver='liblinear')

# Feed the sparse count matrices directly: densifying a documents-by-vocabulary
# matrix costs a lot of memory for no benefit, and the np.matrix returned by
# .todense() is not accepted by current scikit-learn estimators.
lr.fit(train_mat, train_set_Y)
res = lr.predict(test_mat)

# Fraction of held-out documents whose predicted label matches the target.
print(np.mean(res == np.asarray(test_set_Y)))
from operator import itemgetter

# First (and, for a two-class target, only) row of the coefficient matrix:
# one weight per vocabulary column.
coef = lr.coef_[0]
# (column index, weight) pairs in ascending weight order. sorted() is stable,
# so equal weights keep their column order.
coef_ind = sorted(enumerate(coef), key=itemgetter(1))

# The fitted term -> column mapping is exposed as `vocabulary_`; fall back to
# `vocabulary` for old releases that stored the fitted mapping there.
vocabulary = getattr(vec, "vocabulary_", None)
if vocabulary is None:
    vocabulary = vec.vocabulary
# (term, column) pairs ordered by column, so voc[i] is the term for column i.
# Assumes the columns are numbered contiguously from 0 -- TODO confirm.
voc = sorted(vocabulary.items(), key=itemgetter(1))

# The k largest weights, listed from smallest to largest.
k = 20
topk = [voc[i] for i, _ in coef_ind[-k:]]
print(topk)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment