Aaron Halfaker (halfak)

@halfak
halfak / demo_tokenize.py
Created June 1, 2020 16:11
Tokenizer stuck on a Japanese revision
import mwapi
from deltas.tokenizers import wikitext_split

# Fetch the content of the revision that the tokenizer gets stuck on.
rev_id = 57246316
session = mwapi.Session("https://ja.wikipedia.org")
doc = session.get(action="query", prop="revisions", revids=[rev_id],
                  rvslots="main", rvprop="content", formatversion=2)
text = doc['query']['pages'][0]['revisions'][0]['slots']['main']['content']

location = 0
# The gist is truncated here; the next step would be the call that hangs:
tokens = wikitext_split.tokenize(text)
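
One way to narrow down where the tokenizer stalls (a debugging sketch, not part of the original gist; it assumes the `text` fetched above and a Unix platform for signal.alarm):

import signal

def tokenize_with_timeout(snippet, seconds=5):
    # Returns the token list, or None if tokenization exceeds `seconds`.
    def handler(signum, frame):
        raise TimeoutError()
    signal.signal(signal.SIGALRM, handler)
    signal.alarm(seconds)
    try:
        return wikitext_split.tokenize(snippet)
    except TimeoutError:
        return None
    finally:
        signal.alarm(0)

# Scan growing prefixes to find roughly where tokenization starts to hang.
for end in range(1000, len(text) + 1000, 1000):
    if tokenize_with_timeout(text[:end]) is None:
        print("tokenizer stalls before offset", end)
        break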
# Ukrainian Wikipedia assessment labels: "Вибрана стаття" means
# "Featured article" and "Добра стаття" means "Good article".
NORMALIZED_LABELS = {
    "ВС": ["ВС", "вс", "Вибрана стаття", "вибрана стаття"],
    "ДС": ["ДС", "дс", "Добра стаття", "добра стаття"],
    "I": ["I", "1"],
    "II": ["II", "2"],
    "III": ["III", "3"],
    "IV": ["IV", "4", "Stub", "stub"]
}
# Invert the mapping so any observed label normalizes in one lookup.
LABEL_MAP = {observed_label: normalized_label
             for normalized_label, observed_labels in NORMALIZED_LABELS.items()
             for observed_label in observed_labels}
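
A quick sanity check of the inverted map:

assert LABEL_MAP["вибрана стаття"] == "ВС"
assert LABEL_MAP["stub"] == "IV"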
$ git diff
diff --git a/revscoring/languages/features/matches/regex_matches.py b/revscoring/languages/features/matches/regex_matches.py
index 767ee20..5070dd0 100644
--- a/revscoring/languages/features/matches/regex_matches.py
+++ b/revscoring/languages/features/matches/regex_matches.py
@@ -1,3 +1,10 @@
+"""
+Implements a feature set based off of a list of regular expressions to match.
+
+.. autoclass:: revscoring.languages.features.RegexMatches^M
$ python
Python 3.7.7 (default, May 19 2020, 15:54:35)
[GCC 5.4.0 20160609] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> from revscoring.languages.features import RegexMatches, SubstringMatches
>>> from revscoring.datasources import revision_oriented as ro
>>> from revscoring.dependencies import solve
>>> rm = RegexMatches("Foo", ["bad", "badword"])
>>> solve(rm.revision.diff.matches_added, cache={ro.revision.text: "I have some bad words don't you know?", ro.revision.parent.text: "I have some badword words just so you bad."})
1.0
"""
``$ articlequality weighted_sum -h``
::
    Extracts the probabilities assigned to each class from the output of the
    revscoring score utility and outputs the weighted sum of the predicted
    article quality, where the weight for each class is stored in a YAML
    config file.

    Usage:
        weighted_sum <weights> [--scores=<path>] [--output=<path>]
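
The computation itself is just a dot product of class weights and class probabilities; a minimal sketch with hypothetical probabilities and the ptwiki weights loaded below:

weights = {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6}
probabilities = {1: 0.05, 2: 0.10, 3: 0.25, 4: 0.35, 5: 0.15, 6: 0.10}  # hypothetical
weighted_sum = sum(weights[cls] * prob for cls, prob in probabilities.items())
print(round(weighted_sum, 2))  # 3.75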
>>> import yamlconf
>>> yamlconf.load(open("ptwiki.yaml"))
{1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6}
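
Reconstructed from the load output above, ptwiki.yaml maps each of the six quality classes to its weight:

# ptwiki.yaml
1: 1
2: 2
3: 3
4: 4
5: 5
6: 6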
$ python
Python 3.5.1+ (default, Mar 30 2016, 22:46:26)
[GCC 5.3.1 20160330] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import requests
>>> import mwapi
>>> session = mwapi.Session("https://deployment.wikimedia.beta.wmflabs.org/")
Sending requests with default User-Agent. Set 'user_agent' on mwapi.Session to quiet this message.
>>> session.get(action='query', meta="tokens", format="json")
{'query': {'tokens': {'csrftoken': '+\\'}}, 'batchcomplete': ''}
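
The '+\\' value is the placeholder token MediaWiki returns for anonymous sessions. With an authenticated session, the same call yields a real CSRF token that can be passed to write actions; a sketch (the page title and text are made up):

token = session.get(action='query', meta='tokens')['query']['tokens']['csrftoken']
session.post(action='edit', title='Project:Sandbox', text='Hello, beta cluster!',
             summary='mwapi demo', token=token)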
@halfak
halfak / cmd.bash
Created April 28, 2020 14:31
Sample of labels and words_to_watch
$ bzcat datasets/ptwiki.draft_quality.balanced_3k.with_text.json.bz2 | \
shuf -n 100 | python demo_ptwiki_w2w.py | sort -k1,1
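
demo_ptwiki_w2w.py itself is not in this listing. Presumably it reads one JSON observation per line and prints the label next to any words_to_watch matches so that sort can group by label; a hypothetical sketch (the field names and word list are assumptions):

import json
import sys
import re

# Stand-in for revscoring's Portuguese words_to_watch list.
WORDS_TO_WATCH = re.compile(r"\b(?:lindo|maravilhosa?|incrível)\b", re.IGNORECASE)

for line in sys.stdin:
    observation = json.loads(line)
    matches = WORDS_TO_WATCH.findall(observation.get("text", ""))
    print(observation.get("draft_quality"), ", ".join(matches), sep="\t")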
$ python
Python 3.5.3 (default, Sep 27 2018, 17:25:39)
[GCC 6.3.0 20170516] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> from revscoring import Model
>>> model = Model.load(open("models/ptwiki.wp10.gradient_boosting.model"))
>>> importance_features = list(sorted(zip(model.estimator.feature_importances_, model.features), reverse=True))
>>> for importance, feature in importance_features:
... print(round(importance, 3), feature)
...
import time
import mwapi
import re
from deltas.tokenizers import wikitext_split
'''text = """
This is a sentence [[derp|link]].
Here is another paragraph with the number 10.