Requires python-virtualenv: http://packages.ubuntu.com/trusty/python/python-virtualenv
In this sequence, I'm going to assume that Python 3.4 is the installed version.
$ cd ~
$ mkdir env
$ cd env
Requires python-virtualenv: http://packages.ubuntu.com/trusty/python/python-virtualenv
In this sequence, I'm going to assume that Python 3.4 is the installed version.
$ cd ~
$ mkdir env
$ cd env
{ | |
" Culture and the arts": { | |
"name": "Culture and the arts", | |
"root_url": "Wikipedia:WikiProject_Council/Directory", | |
"index": "2", | |
"url": "Wikipedia:WikiProject_Council/Directory/Culture", | |
"topics": { | |
"Culture and the arts": { | |
"name": "Culture and the arts", | |
"root_url": "Wikipedia:WikiProject_Council/Directory/Culture", |
from revscoring.features import wikitext | |
from revscoring.features.modifiers import max, sub | |
from revscoring.languages import english | |
from revscoring import Feature | |
from revscoring.features import FeatureVector | |
from revscoring.datasources import Datasource, revision_oriented | |
from revscoring.dependencies import solve | |
from gensim.models.keyedvectors import KeyedVectors | |
import numpy as np |
from revscoring.languages import english | |
from revscoring.datasources.meta import (frequencies, gramming, hashing, | |
mappers) | |
import numpy as np | |
from revscoring import Feature | |
from revscoring.features import FeatureVector, wikitext | |
grams = [(0,), (0, 1), (0, 2)] | |
hashed_bow = frequencies.table( |
from revscoring.utilities import util | |
from revscoring.dependencies import solve | |
from revscoring.datasources import revision_oriented | |
import yamlconf | |
from sklearn.feature_extraction.text import HashingVectorizer | |
from sklearn.ensemble import RandomForestClassifier | |
from sklearn.preprocessing import MultiLabelBinarizer | |
#features=hbow |
Wed Dec 13 04:56:17 2017 stats [93/1765] | |
3844350339 function calls (3843797676 primitive calls) in 30453.783 seconds | |
Ordered by: cumulative time | |
List reduced from 260 to 50 due to restriction <50> | |
ncalls tottime percall cumtime percall filename:lineno(function) | |
1 0.000 0.000 30453.783 30453.783 /home/codezee/ai/venv/lib/python3.4/site-packages/revscoring-2.0.11-py3.4.egg/revscoring/scoring/models/model.py:209(cro$ | |
s_validate) |
(venv)codezee@ores-misc-01:/home/codezee/ai/drafttopic git:(extract-from-text*) $ revscoring tune config/gb.params.yaml drafttopic.feature_lists.w2v.drafttopic mid_leve | |
l_categories accuracy.macro --debug --observations=enwiki.labeled_wikiprojects.w_cache.json --verbose --multilabel --labels-config=fulllabels-config.yaml --folds=3 | |
2017-12-17 13:25:13,512 DEBUG:gensim.models.doc2vec -- Fast version of gensim.models.doc2vec is being used | |
2017-12-17 13:25:13,527 DEBUG:gensim.models.fasttext -- Fast version of Fasttext is being used | |
2017-12-17 13:25:13,530 INFO:summa.preprocessing.cleaner -- 'pattern' package not found; tag filters are not available for English | |
2017-12-17 13:25:13,539 INFO:revscoring.utilities.tune -- Reading feature values & labels... | |
2017-12-17 13:26:49,555 DEBUG:revscoring.utilities.tune -- Starting up multiprocessing pool (processes=8) | |
# Model tuning report | |
- Revscoring version: 2.0.11 | |
- Features: drafttopic.feature_lists.w2v.drafttopic |
def score_many(self, feature_values): | |
# Re-vectorize features -- this expands/flattens sub-FeatureVectors | |
fv_vectors = [vectorize_values(fv) for fv in feature_values] | |
# Scale and transform (if applicable) | |
scaled_fv_vectors = self.fit_scaler_and_transform(fv_vectors) | |
Statistics: [319/1813] | |
counts (n=93415): | |
label n TP FP FN TN | |
--------------------------------------------- ----- --- ---- ----- ---- ----- | |
'Culture.Philosophy and religion' 4116 --> 0 4116 0 89299 | |
'Culture.Plastic arts' 3924 --> 0 3924 0 89491 | |
'STEM.Geosciences' 2085 --> 0 2085 0 91330 | |
'Geography.Bodies of water' 2336 --> 0 2336 0 91079 | |
'Geography.Countries' 25576 --> 0 25576 0 67839 | |
'History_And_Society.Business and economics' |
counts (n=10000): | |
label n TP FP FN TN | |
--------------------------------------------- ---- --- ---- ---- ---- ---- | |
'STEM.Time' 270 --> 200 70 155 9575 | |
'STEM.Physics' 284 --> 245 39 554 9162 | |
'STEM.Space' 251 --> 229 22 111 9638 | |
'STEM.Mathematics' 164 --> 133 31 283 9553 | |
'Culture.Crafts and hobbies' 232 --> 156 76 44 9724 | |
'History_And_Society.Transportation' 469 --> 389 80 155 9376 | |
'Geography.Maps' 287 --> 217 70 564 9149 |