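# Benchmark: how fast can we tokenize a full Wikipedia article?  Compares the
# deltas wikitext_split tokenizer against a simple multi-script word/CJK
# regular expression, using the English Wikipedia article "Alan Turing".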
import time
import mwapi
import re
from deltas.tokenizers import wikitext_split
# A small sample document, disabled in favor of fetching a live article below:
# text = """
# This is a sentence [[derp|link]].
# Here is another paragraph with the number 10.
# """
session = mwapi.Session("https://en.wikipedia.org")
# Fetch the current wikitext of the "Alan Turing" article (main slot).
doc = session.get(action="query", prop="revisions",
                  titles="Alan Turing", rvprop="content", rvslots="main",
                  formatversion=2)
text = doc['query']['pages'][0]['revisions'][0]['slots']['main']['content']
# Set up a simple word extractor.
# Character ranges for Devanagari, Arabic, and Bengali script words.
devanagari_word = r'\u0901-\u0963'
arabic_word = r'\u0601-\u061A' + \
              r'\u061C-\u0669' + \
              r'\u06D5-\u06EF'
bengali_word = r'\u0980-\u09FF'
combined_word = devanagari_word + arabic_word + bengali_word
# A word starts with a letter (or a character from the ranges above), may
# continue with word characters, and may contain internal apostrophes.
WORD_RE = r'([^\W\d]|[' + combined_word + r'])' + \
          r'[\w' + combined_word + r']*' + \
          r'([\'’]([\w' + combined_word + r']+|(?=($|\s))))*'
# Matches Chinese, Japanese and Korean characters.
CJK_RE = (
    r'[' +
    r'\u4E00-\u62FF' +  # noqa Unified Ideographs
    r'\u6300-\u77FF' +
    r'\u7800-\u8CFF' +
    r'\u8D00-\u9FCC' +
    r'\u3400-\u4DFF' +  # Unified Ideographs Ext A
    r'\U00020000-\U000215FF' +  # Unified Ideographs Ext. B
    r'\U00021600-\U000230FF' +
    r'\U00023100-\U000245FF' +
    r'\U00024600-\U000260FF' +
    r'\U00026100-\U000275FF' +
    r'\U00027600-\U000290FF' +
    r'\U00029100-\U0002A6DF' +
    r'\uF900-\uFAFF' +  # Compatibility Ideographs
    r'\U0002F800-\U0002FA1F' +  # Compatibility Ideographs Suppl.
    r'\u3041-\u3096' +  # Hiragana
    r'\u30A0-\u30FF' +  # Katakana
    r'\u3400-\u4DB5' +  # Kanji
    r'\u4E00-\u9FCB' +
    r'\uF900-\uFA6A' +
    r'\u2E80-\u2FD5' +  # Kanji radicals
    r'\uFF5F-\uFF9F' +  # Katakana and Punctuation (Half Width)
    r'\u31F0-\u31FF' +  # Miscellaneous Japanese Symbols and Characters
    r'\u3220-\u3243' +
    r'\u3280-\u337F' +
    r']'
)
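# Match either a script-aware word or a single CJK character.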
WORD_OR_CJK_RE = re.compile(WORD_RE + "|" + CJK_RE)
# Run our tests
# Time 100 passes of the full wikitext tokenizer over the article.
start = time.time()
for i in range(100):
    list(wikitext_split.tokenize(text))
print("We can tokenize", 1 / ((time.time() - start) / 100), "Alan Turings per second")

# Show a sample of what the simple extractor matches.
print("Words we extract:", [str(m.group(0)) for m in WORD_OR_CJK_RE.finditer(text)][:100])

# Time 100 passes of the simple word/CJK regex extractor.
start = time.time()
for i in range(100):
    list(WORD_OR_CJK_RE.finditer(text))
print("We can extract words from", 1 / ((time.time() - start) / 100), "Alan Turings per second")