Aaron Halfaker (halfak)

@halfak
halfak / demo_tokenize.py
Created June 1, 2020 16:11
Tokenizer stuck on a Japanese revision
import mwapi
from deltas.tokenizers import wikitext_split

# Fetch the content of the revision that the tokenizer gets stuck on.
rev_id = 57246316
session = mwapi.Session("https://ja.wikipedia.org")
doc = session.get(action="query", prop="revisions", revids=[rev_id],
                  rvslots="main", rvprop="content", formatversion=2)
text = doc['query']['pages'][0]['revisions'][0]['slots']['main']['content']

location = 0
# The gist is truncated here; the next step would be the call that hangs:
tokens = wikitext_split.tokenize(text)
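
One way to narrow down where the tokenizer stalls (a debugging sketch, not part of the original gist; it assumes the `text` fetched above and a Unix platform for signal.alarm):

import signal

def tokenize_with_timeout(snippet, seconds=5):
    # Returns the token list, or None if tokenization exceeds `seconds`.
    def handler(signum, frame):
        raise TimeoutError()
    signal.signal(signal.SIGALRM, handler)
    signal.alarm(seconds)
    try:
        return wikitext_split.tokenize(snippet)
    except TimeoutError:
        return None
    finally:
        signal.alarm(0)

# Scan growing prefixes to find roughly where tokenization starts to hang.
for end in range(1000, len(text) + 1000, 1000):
    if tokenize_with_timeout(text[:end]) is None:
        print("tokenizer stalls before offset", end)
        break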
# Ukrainian Wikipedia assessment labels: "Вибрана стаття" means
# "Featured article" and "Добра стаття" means "Good article".
NORMALIZED_LABELS = {
    "ВС": ["ВС", "вс", "Вибрана стаття", "вибрана стаття"],
    "ДС": ["ДС", "дс", "Добра стаття", "добра стаття"],
    "I": ["I", "1"],
    "II": ["II", "2"],
    "III": ["III", "3"],
    "IV": ["IV", "4", "Stub", "stub"]
}
# Invert the mapping so any observed label normalizes in one lookup.
LABEL_MAP = {observed_label: normalized_label
             for normalized_label, observed_labels in NORMALIZED_LABELS.items()
             for observed_label in observed_labels}
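
A quick sanity check of the inverted map:

assert LABEL_MAP["вибрана стаття"] == "ВС"
assert LABEL_MAP["stub"] == "IV"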
$ git diff
diff --git a/revscoring/languages/features/matches/regex_matches.py b/revscoring/languages/features/matches/regex_matches.py
index 767ee20..5070dd0 100644
--- a/revscoring/languages/features/matches/regex_matches.py
+++ b/revscoring/languages/features/matches/regex_matches.py
@@ -1,3 +1,10 @@
+"""
+Implements a feature set based off of a list of regular expressions to match.
+
+.. autoclass:: revscoring.languages.features.RegexMatches^M
$ python
Python 3.7.7 (default, May 19 2020, 15:54:35)
[GCC 5.4.0 20160609] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> from revscoring.languages.features import RegexMatches, SubstringMatches
>>> from revscoring.datasources import revision_oriented as ro
>>> from revscoring.dependencies import solve
>>> rm = RegexMatches("Foo", ["bad", "badword"])
>>> solve(rm.revision.diff.matches_added, cache={ro.revision.text: "I have some bad words don't you know?", ro.revision.parent.text: "I have some badword words just so you bad."})
1.0
"""
``$ articlequality weighted_sum -h``
::
    Extracts the probabilities assigned to each class from the output of the
    revscoring score utility and outputs the weighted sum of the predicted
    article quality, where the weight for each class is stored in a YAML
    config file.

    Usage:
        weighted_sum <weights> [--scores=<path>] [--output=<path>]
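
The computation itself is just a dot product of class weights and class probabilities; a minimal sketch with hypothetical probabilities and the ptwiki weights loaded below:

weights = {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6}
probabilities = {1: 0.05, 2: 0.10, 3: 0.25, 4: 0.35, 5: 0.15, 6: 0.10}  # hypothetical
weighted_sum = sum(weights[cls] * prob for cls, prob in probabilities.items())
print(round(weighted_sum, 2))  # 3.75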
>>> import yamlconf
>>> yamlconf.load(open("ptwiki.yaml"))
{1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6}
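
Reconstructed from the load output above, ptwiki.yaml maps each of the six quality classes to its weight:

# ptwiki.yaml
1: 1
2: 2
3: 3
4: 4
5: 5
6: 6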
$ python
Python 3.5.1+ (default, Mar 30 2016, 22:46:26)
[GCC 5.3.1 20160330] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import requests
>>> import mwapi
>>> session = mwapi.Session("https://deployment.wikimedia.beta.wmflabs.org/")
Sending requests with default User-Agent. Set 'user_agent' on mwapi.Session to quiet this message.
>>> session.get(action='query', meta="tokens", format="json")
{'query': {'tokens': {'csrftoken': '+\\'}}, 'batchcomplete': ''}
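
The '+\\' value is the placeholder token MediaWiki returns for anonymous sessions. With an authenticated session, the same call yields a real CSRF token that can be passed to write actions; a sketch (the page title and text are made up):

token = session.get(action='query', meta='tokens')['query']['tokens']['csrftoken']
session.post(action='edit', title='Project:Sandbox', text='Hello, beta cluster!',
             summary='mwapi demo', token=token)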
@halfak
halfak / cmd.bash
Created April 28, 2020 14:31
Sample of labels and words_to_watch
$ bzcat datasets/ptwiki.draft_quality.balanced_3k.with_text.json.bz2 | \
shuf -n 100 | python demo_ptwiki_w2w.py | sort -k1,1
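
demo_ptwiki_w2w.py itself is not in this listing. Presumably it reads one JSON observation per line and prints the label next to any words_to_watch matches so that sort can group by label; a hypothetical sketch (the field names and word list are assumptions):

import json
import sys
import re

# Stand-in for revscoring's Portuguese words_to_watch list.
WORDS_TO_WATCH = re.compile(r"\b(?:lindo|maravilhosa?|incrível)\b", re.IGNORECASE)

for line in sys.stdin:
    observation = json.loads(line)
    matches = WORDS_TO_WATCH.findall(observation.get("text", ""))
    print(observation.get("draft_quality"), ", ".join(matches), sep="\t")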
$ python
Python 3.5.3 (default, Sep 27 2018, 17:25:39)
[GCC 6.3.0 20170516] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> from revscoring import Model
>>> model = Model.load(open("models/ptwiki.wp10.gradient_boosting.model"))
>>> importance_features = list(sorted(zip(model.estimator.feature_importances_, model.features), reverse=True))
>>> for importance, feature in importance_features:
... print(round(importance, 3), feature)
...
import time
import mwapi
import re
from deltas.tokenizers import wikitext_split
'''text = """
This is a sentence [[derp|link]].
Here is another paragraph with the number 10.