Skip to content

Instantly share code, notes, and snippets.

@mike-anderson
Last active December 30, 2015 15:59
Show Gist options
  • Save mike-anderson/7851290 to your computer and use it in GitHub Desktop.
Save mike-anderson/7851290 to your computer and use it in GitHub Desktop.
comment composition random forest
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
import json
import numpy
def load_comments(path):
    """Read the JSON thread dump at *path* and flatten it into parallel lists.

    The document is indexed as ``data[key][i]['body' | 'ups' | 'downs']``.
    The first entry of every thread is skipped (presumably it is the
    submission itself rather than a comment -- TODO confirm against the
    scraper that produced the file).

    Returns:
        A ``(bodies, scores)`` tuple: the UTF-8 encoded comment bodies and
        the matching net scores (``ups - downs``), in the same order.
    """
    # Close the file deterministically instead of leaking the handle.
    with open(path) as handle:
        data = json.load(handle)

    bodies = []
    scores = []
    for article_key in data:
        comments = data[article_key][1:]
        # extend() grows the lists in place; rebuilding them with ``+`` for
        # every thread copies everything collected so far (quadratic).
        bodies.extend(x['body'].encode('utf-8') for x in comments)
        scores.extend(x['ups'] - x['downs'] for x in comments)
    return bodies, scores


def bucket_score(score):
    """Map a net comment score to one of the class labels ``-1 .. 4``.

    ``<= 0`` -> -1, ``1`` -> 0, ``2..10`` -> 1, ``11..50`` -> 2,
    ``51..100`` -> 3, anything else -> 4.
    """
    if score <= 0:
        return -1
    if score == 1:
        return 0
    if 1 < score <= 10:
        return 1
    if 10 < score <= 50:
        return 2
    if 50 < score <= 100:
        return 3
    return 4


if __name__ == "__main__":
    # Kept at module level (not inside a function) so that corpus,
    # classifiers, vectorizer, sample and forest stay inspectable after an
    # interactive run, exactly as before.
    corpus, scores = load_comments('askreddit.json')
    classifiers = [bucket_score(score) for score in scores]

    vectorizer = TfidfVectorizer(stop_words='english', lowercase=True, ngram_range=(1, 2), min_df=1)
    sample = vectorizer.fit_transform(corpus)

    # Densify the tf-idf matrix and build the label array once; both are
    # needed for cross-validation and again for the final fit.
    features = sample.toarray()
    labels = numpy.array(classifiers)

    forest = RandomForestClassifier(n_estimators=10)
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print(cross_val_score(forest, features, labels))

    forest.fit(features, labels)
    # The 25 n-grams with the highest feature importance, most important first.
    print(sorted(zip(forest.feature_importances_, vectorizer.get_feature_names()), reverse=True)[:25])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment