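# Benchmark: how fast can we tokenize a full Wikipedia article?  Compares the
# deltas wikitext_split tokenizer against a simple multi-script word/CJK
# regular expression, using the English Wikipedia article "Alan Turing".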
import time
import mwapi
import re
from deltas.tokenizers import wikitext_split
# A small sample document, disabled in favor of fetching a live article below:
# text = """
# This is a sentence [[derp|link]].
# Here is another paragraph with the number 10.
# """
session = mwapi.Session("https://en.wikipedia.org")
# Fetch the current wikitext of the "Alan Turing" article (main slot).
doc = session.get(action="query", prop="revisions",
                  titles="Alan Turing", rvprop="content", rvslots="main",
                  formatversion=2)
text = doc['query']['pages'][0]['revisions'][0]['slots']['main']['content']
# Set up a simple word extractor.
# Character ranges for Devanagari, Arabic, and Bengali script words.
devanagari_word = r'\u0901-\u0963'
arabic_word = r'\u0601-\u061A' + \
              r'\u061C-\u0669' + \
              r'\u06D5-\u06EF'
bengali_word = r'\u0980-\u09FF'
combined_word = devanagari_word + arabic_word + bengali_word
# A word starts with a letter (or a character from the ranges above), may
# continue with word characters, and may contain internal apostrophes.
WORD_RE = r'([^\W\d]|[' + combined_word + r'])' + \
          r'[\w' + combined_word + r']*' + \
          r'([\'’]([\w' + combined_word + r']+|(?=($|\s))))*'
# Matches Chinese, Japanese and Korean characters.
CJK_RE = (
    r'[' +
    r'\u4E00-\u62FF' +  # noqa Unified Ideographs
    r'\u6300-\u77FF' +
    r'\u7800-\u8CFF' +
    r'\u8D00-\u9FCC' +
    r'\u3400-\u4DFF' +  # Unified Ideographs Ext A
    r'\U00020000-\U000215FF' +  # Unified Ideographs Ext. B
    r'\U00021600-\U000230FF' +
    r'\U00023100-\U000245FF' +
    r'\U00024600-\U000260FF' +
    r'\U00026100-\U000275FF' +
    r'\U00027600-\U000290FF' +
    r'\U00029100-\U0002A6DF' +
    r'\uF900-\uFAFF' +  # Compatibility Ideographs
    r'\U0002F800-\U0002FA1F' +  # Compatibility Ideographs Suppl.
    r'\u3041-\u3096' +  # Hiragana
    r'\u30A0-\u30FF' +  # Katakana
    r'\u3400-\u4DB5' +  # Kanji
    r'\u4E00-\u9FCB' +
    r'\uF900-\uFA6A' +
    r'\u2E80-\u2FD5' +  # Kanji radicals
    r'\uFF5F-\uFF9F' +  # Katakana and Punctuation (Half Width)
    r'\u31F0-\u31FF' +  # Miscellaneous Japanese Symbols and Characters
    r'\u3220-\u3243' +
    r'\u3280-\u337F' +
    r']'
)
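# Match either a script-aware word or a single CJK character.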
WORD_OR_CJK_RE = re.compile(WORD_RE + "|" + CJK_RE)
# Run our tests
# Time 100 passes of the full wikitext tokenizer over the article.
start = time.time()
for i in range(100):
    list(wikitext_split.tokenize(text))
print("We can tokenize", 1 / ((time.time() - start) / 100), "Alan Turings per second")

# Show a sample of what the simple extractor matches.
print("Words we extract:", [str(m.group(0)) for m in WORD_OR_CJK_RE.finditer(text)][:100])

# Time 100 passes of the simple word/CJK regex extractor.
start = time.time()
for i in range(100):
    list(WORD_OR_CJK_RE.finditer(text))
print("We can extract words from", 1 / ((time.time() - start) / 100), "Alan Turings per second")