Created
April 22, 2020 14:38
-
-
Save halfak/c39e12a63693b0adee735818fcbcc4c9 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time | |
import mwapi | |
import re | |
from deltas.tokenizers import wikitext_split | |
# Fetch the current wikitext of the [[Alan Turing]] article from English
# Wikipedia; this is the corpus the benchmarks below run against.
# (Removed a dead, commented-out sample-text literal that was evaluated
# and discarded at import time.)
session = mwapi.Session("https://en.wikipedia.org")
doc = session.get(action="query", prop="revisions",
                  titles="Alan Turing", rvprop="content", rvslots="main",
                  formatversion=2)
# formatversion=2 gives pages as a list, hence the [0] indexing.
text = doc['query']['pages'][0]['revisions'][0]['slots']['main']['content']
# Set up simple word extractor.
# Explicit character-class fragments for scripts we want covered beyond
# what bare \w handles for our purposes.
devangari_word = r'\u0901-\u0963'
arabic_word = (
    r'\u0601-\u061A'
    r'\u061C-\u0669'
    r'\u06D5-\u06EF'
)
bengali_word = r'\u0980-\u09FF'
combined_word = devangari_word + arabic_word + bengali_word
# A "word": one letter (non-digit word char or covered-script char),
# any run of word chars, then optional apostrophe-joined suffixes
# (e.g. "don't"), where a trailing apostrophe before whitespace/end
# is also tolerated.
WORD_RE = (
    r'([^\W\d]|[' + combined_word + r'])'
    + r'[\w' + combined_word + r']*'
    + r'([\'’]([\w' + combined_word + r']+|(?=($|\s))))*'
)
# Matches Chinese, Japanese and Korean characters.
# Each entry is one contiguous codepoint range; the final pattern is a
# single character class covering all of them.
_CJK_RANGES = (
    r'\u4E00-\u62FF',           # noqa Unified Ideographs
    r'\u6300-\u77FF',
    r'\u7800-\u8CFF',
    r'\u8D00-\u9FCC',
    r'\u3400-\u4DFF',           # Unified Ideographs Ext A
    r'\U00020000-\U000215FF',   # Unified Ideographs Ext. B
    r'\U00021600-\U000230FF',
    r'\U00023100-\U000245FF',
    r'\U00024600-\U000260FF',
    r'\U00026100-\U000275FF',
    r'\U00027600-\U000290FF',
    r'\U00029100-\U0002A6DF',
    r'\uF900-\uFAFF',           # Compatibility Ideographs
    r'\U0002F800-\U0002FA1F',   # Compatibility Ideographs Suppl.
    r'\u3041-\u3096',           # Hiragana
    r'\u30A0-\u30FF',           # Katakana
    r'\u3400-\u4DB5',           # Kanji
    r'\u4E00-\u9FCB',
    r'\uF900-\uFA6A',
    r'\u2E80-\u2FD5',           # Kanji radicals
    r'\uFF5F-\uFF9F',           # Katakana and Punctuation (Half Width)
    r'\u31F0-\u31FF',           # Miscellaneous Japanese Symbols and Characters
    r'\u3220-\u3243',
    r'\u3280-\u337F',
)
CJK_RE = r'[' + ''.join(_CJK_RANGES) + r']'
# Either a full word in a covered script, or a single CJK character.
WORD_OR_CJK_RE = re.compile("|".join((WORD_RE, CJK_RE)))
# Run our tests: compare tokenization throughput of deltas'
# wikitext_split against plain regex word extraction, each averaged
# over N_ROUNDS full passes through the article text.
N_ROUNDS = 100

# perf_counter is a monotonic, high-resolution clock — the right tool
# for timing; time.time() is wall clock and can jump.
start = time.perf_counter()
for _ in range(N_ROUNDS):
    list(wikitext_split.tokenize(text))
elapsed = (time.perf_counter() - start) / N_ROUNDS
print("We can tokenize", 1 / elapsed, "Alan Turing's per second")

# m.group(0) is already a str; no str() wrapper needed.
print("Words we extract:",
      [m.group(0) for m in WORD_OR_CJK_RE.finditer(text)][:100])

start = time.perf_counter()
for _ in range(N_ROUNDS):
    list(WORD_OR_CJK_RE.finditer(text))
elapsed = (time.perf_counter() - start) / N_ROUNDS
print("We can extract words from", 1 / elapsed, "Alan Turing's per second")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment