Skip to content

Instantly share code, notes, and snippets.

@codez266
Created December 12, 2017 16:26
Show Gist options
  • Save codez266/e7d5c9ac6d7b9896b386615deb3c67db to your computer and use it in GitHub Desktop.
Save codez266/e7d5c9ac6d7b9896b386615deb3c67db to your computer and use it in GitHub Desktop.
sklearn model test
import time

import yamlconf
from revscoring.datasources import revision_oriented
from revscoring.dependencies import solve
from revscoring.languages import english
from revscoring.utilities import util
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
# Timing experiment: turn each labeled observation's text into a
# space-joined string of non-stopword tokens, hash those strings into a
# feature matrix, and measure how long a RandomForestClassifier takes to fit
# against the binarized multi-label targets.

#features=hbow
label_name = 'mid_level_categories'

# Earlier revscoring-native approach, kept for reference:
#value_labels = [(list(solve(features, cache=ob['cache'])), ob[label_name])
# for ob in observations]
#labels, lw, pr = util.read_labels_and_population_rates(None,
# None, None, 'labels-config.yaml')
#
#model = RandomForest(features, labels, multilabel=True)
#model.train(value_labels)

hv = HashingVectorizer()
mlb = MultiLabelBinarizer()
labels = []
X = []

# NOTE(review): util.read_observations is presumed to read lazily from the
# file handle, so the whole loop stays inside the `with` block; the handle is
# closed as soon as every observation has been consumed.
with open('enwiki.labeled_wikiprojects.w_text.json') as obs_file:
    for ob in util.read_observations(obs_file):
        labels.append(ob[label_name])
        # Resolve the non-stopword token datasource for this observation by
        # injecting its raw text into the dependency cache.
        tokens = solve(
            english.stopwords.revision.datasources.non_stopwords,
            cache={revision_oriented.revision.text: ob['text']})
        # HashingVectorizer expects one string per document.
        X.append(" ".join(tokens))

# One indicator column per distinct label seen across all observations.
label_matrix = mlb.fit_transform(labels)
print("Preprocessing done, classifying...")

clf = RandomForestClassifier()
# Only the fit call (including the hashing transform) is timed.
t1 = time.time()
clf.fit(hv.transform(X), label_matrix)
t2 = time.time()
print("Time:{}".format(t2-t1))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment