Created
December 12, 2017 16:26
-
-
Save codez266/e7d5c9ac6d7b9896b386615deb3c67db to your computer and use it in GitHub Desktop.
sklearn model test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time

import yamlconf
from revscoring.datasources import revision_oriented
from revscoring.dependencies import solve
from revscoring.languages import english
from revscoring.utilities import util
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
# Quick experiment: fit a scikit-learn random forest on hashed text features
# built from the non-stopword tokens of each labeled observation, and report
# how long the fit takes.

#features=hbow
label_name = 'mid_level_categories'

# Earlier revscoring-native approach, kept for reference:
#value_labels = [(list(solve(features, cache=ob['cache'])), ob[label_name])
#                for ob in observations]
#labels, lw, pr = util.read_labels_and_population_rates(None,
#        None, None, 'labels-config.yaml')
#
#model = RandomForest(features, labels, multilabel=True)
#model.train(value_labels)

hv = HashingVectorizer()
mlb = MultiLabelBinarizer()
labels = []
X = []

# The observations are consumed inside the `with` block so the file is still
# open while they are read (the reader may be lazy -- TODO confirm) and is
# closed deterministically afterwards.
with open('enwiki.labeled_wikiprojects.w_text.json',
          encoding='utf-8') as observations_file:
    observations = util.read_observations(observations_file)
    for ob in observations:
        labels.append(ob[label_name])
        # Inject the raw text as the revision text so the non-stopword
        # datasource can be solved without any other lookup.
        non_stopwords = solve(
            english.stopwords.revision.datasources.non_stopwords,
            cache={revision_oriented.revision.text: ob['text']})
        # HashingVectorizer expects one string per document.
        X.append(" ".join(non_stopwords))

# Turn the per-observation label collections into a binary indicator matrix.
label_matrix = mlb.fit_transform(labels)
print("Preprocessing done, classifying...")
clf = RandomForestClassifier()
# perf_counter is monotonic, so the elapsed time cannot be skewed by
# system clock adjustments.
t1 = time.perf_counter()
clf.fit(hv.transform(X), label_matrix)
t2 = time.perf_counter()
print("Time:{}".format(t2-t1))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment