Skip to content

Instantly share code, notes, and snippets.

View halfak's full-sized avatar

Aaron Halfaker halfak

View GitHub Profile
$ python
Python 3.5.1+ (default, Mar 30 2016, 22:46:26)
[GCC 5.3.1 20160330] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> from deltas.tokenizers import wikitext_split
>>> wikitext_split.regex.pattern
"(?P<comment_start><!--)|(?P<comment_end>-->)|(?P<url>((bitcoin|geo|magnet|mailto|news|sips?|tel|urn)\\:|((|ftp|ftps|git|gopher|https?|ircs?|mms|nntp|redis|sftp|ssh|svn|telnet|worldwind|xmpp)\\:)?\\/\\/)[^\\s/$.?#].[^\\s]*)|(?P<entity>&[a-z][a-z0-9]*;)|(?P<cjk>[\\u4E00-\\u62FF\\u6300-\\u77FF\\u7800-\\u8CFF\\u8D00-\\u9FCC\\u3400-\\u4DFF\\U00020000-\\U000215FF\\U00021600-\\U000230FF\\U00023100-\\U000245FF\\U00024600-\\U000260FF\\U00026100-\\U000275FF\\U00027600-\\U000290FF\\U00029100-\\U0002A6DF\\uF900-\\uFAFF\\U0002F800-\\U0002FA1F\\u3041-\\u3096\\u30A0-\\u30FF\\u3400-\\u4DB5\\u4E00-\\u9FCB\\uF900-\\uFA6A\\u2E80-\\u2FD5\\uFF5F-\\uFF9F\\u31F0-\\u31FF\\u3220-\\u3243\\u3280-\\u337F])|(?P<ref_open><ref\\b[^>/]*>)|(?P<ref_close></ref\\b[^>]*>)|(?P<ref_singleton><ref\\b[^>/]*/>)|(
0 - Biography
1 - Sports
2 - etc.
[1,1,0,0,0,0,0,0] - Athlete biography
[0,0,1,0,0,0,0,0]
64 UBIGINT - Unsigned INT
import time
import mwapi
import textstat
# Open an API session against English Wikipedia, identifying the client via user_agent.
session = mwapi.Session("https://en.wikipedia.org", user_agent="ahalfaker@wikimedia.org")
# Query the revision content of the page titled 'Alan Turing', asking for the
# "main" slot and the formatversion=2 response layout.
doc = session.get(action='query', prop='revisions', rvprop='content', titles='Alan Turing', rvslots="main", formatversion=2)
# Pull the content string out of the first revision of the first page in the response.
text = doc['query']['pages'][0]['revisions'][0]['slots']['main']['content']
# Record a wall-clock start time; presumably used to time the call below
# (the rest of the script is not visible here -- confirm).
start = time.time()
# Print the Flesch reading-ease score that textstat computes for the page text.
print("flesch_reading_ease", textstat.flesch_reading_ease(text))
import logging
import re
from .extractor import TemplateExtractor
logger = logging.getLogger(__name__)
def from_template(template):
project_name = normalize_project_name(template.name)
$ python
Python 3.5.1+ (default, Mar 30 2016, 22:46:26)
[GCC 5.3.1 20160330] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import re
>>> labels_re = r"([^\|\{\{\}\}]+)\|([0-5*])"
>>> my_template = "{{Marca de projeto|3|Biografias|4|Políticos|4|Brasil|3|WP Offline|2|bot=4/20111127|rev=20170714}}"
>>> [(m.group(1), m.group(2)) for m in re.finditer(labels_re, my_template)]
[('Marca de projeto', '3'), ('Biografias', '4'), ('Políticos', '4'), ('Brasil', '3'), ('WP Offline', '2')]
# The module is named "mwparserfromhell"; the import must match the name used
# in the parse call below.
import mwparserfromhell

# Sample wikitext holding three templates, the last one being a
# "Marca de projeto" template with project/rating pairs.
example = """
{{foo bar baz}}
{{I am a random template|7|Foo bar|8}}
{{Marca de projeto|3|Biografias|4|Políticos|4|Brasil|3|WP Offline|2|bot=4/20111127|rev=20170714}}"""
# Parse the sample defined above and collect every template found in it.
templates = list(mwparserfromhell.parse(example).filter_templates())
def from_template(template):
@halfak
halfak / demo_idioms_performance.py
Last active March 23, 2020 19:34
Extract count of idioms for Alan Turing
import time
import mwapi
from revscoring.dependencies import solve
from revscoring.languages import english
from articlequality.feature_lists import enwiki
# Open an API session against English Wikipedia.
session = mwapi.Session("https://en.wikipedia.org")
# Query the revision content of the page titled 'Alan Turing'.
# rvslots="main" requests the slot-based response layout; without it the API
# falls back to a deprecated legacy format and emits a warning.
doc = session.get(action='query', prop='revisions', rvprop='content', titles='Alan Turing', rvslots="main", formatversion=2)
# With rvslots set, the content sits under the revision's 'slots' -> 'main' entry.
text = doc['query']['pages'][0]['revisions'][0]['slots']['main']['content']
import time
import mwapi
from revscoring.dependencies import solve
from revscoring.features import wikitext
from articlequality.feature_lists.enwiki import text_complexity
# Open an API session against English Wikipedia.
session = mwapi.Session("https://en.wikipedia.org")
# Query the revision content of the page titled 'Alan Turing'.
# rvslots="main" requests the slot-based response layout; without it the API
# falls back to a deprecated legacy format and emits a warning.
doc = session.get(action='query', prop='revisions', rvprop='content', titles='Alan Turing', rvslots="main", formatversion=2)
# With rvslots set, the content sits under the revision's 'slots' -> 'main' entry.
text = doc['query']['pages'][0]['revisions'][0]['slots']['main']['content']
# Three summary values built from `section_flesches` (defined outside this
# excerpt; presumably one Flesch score per section -- confirm), each given an
# explicit name under "wikitext.revisions.sections.".

# Minimum over section_flesches.
min_section_flesch = aggregators.min(
section_flesches,
name="wikitext.revisions.sections.min_flesch")
# Maximum over section_flesches.
max_section_flesch = aggregators.max(
section_flesches,
name="wikitext.revisions.sections.max_flesch")
# Mean over section_flesches.
mean_section_flesch = aggregators.mean(
section_flesches,
name="wikitext.revisions.sections.mean_flesch")
Python 3.5.1+ (default, Mar 30 2016, 22:46:26)
[GCC 5.3.1 20160330] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import mwapi
>>> from revscoring.languages import english
>>> from revscoring.dependencies import solve
>>> doc = mwapi.Session("https://en.wikipedia.org").get(action="query", prop="revisions", titles="Alan Turing", rvprop="content", formatversion=2)
Sending requests with default User-Agent. Set 'user_agent' on mwapi.Session to quiet this message.
The following query raised warnings: {'format': 'json', 'prop': 'revisions', 'rvprop': 'content', 'titles': 'Alan Turing', 'formatversion': 2, 'action': 'query'}
- revisions -- {'warnings': 'Because "rvslots" was not specified, a legacy format has been used for the output. This format is deprecated, and in the future the new format will always be used.'}