# Created May 6, 2020 00:48
# Extracts the text used for the performance test
import time
import mwapi
# Fetch the current wikitext of the "Alan Turing" article; it is the input
# that every tokenizer below is timed against.
# NOTE(review): the host argument was an empty string in the original, which
# cannot be turned into a request URL. English Wikipedia is assumed here --
# confirm this is the wiki the benchmark was meant to use.
session = mwapi.Session(
    "https://en.wikipedia.org",
    user_agent="wikitext_split regex performance test",
)
doc = session.get(action='query', prop='revisions', rvprop='content',
                  titles='Alan Turing', formatversion=2)
# The response is indexed as a list of pages, each with a list of revisions;
# only the first revision of the first page is used.
text = doc['query']['pages'][0]['revisions'][0]['content']
# Functions for tokenization
import json
from deltas.tokenizers import wikitext_split as ws
from deltas.tokenizers.wikitext_split import RegexTokenizer as RT
# Schemes that are followed directly by ':' (no '//').
PLAIN_PROTO = [r'bitcoin', r'geo', r'magnet', r'mailto', r'news', r'sips?',
               r'tel', r'urn']
# Schemes that are followed by '://'. The empty alternative lets a bare ':'
# precede the '//' as well.
SLASHED_PROTO = [r'', r'ftp', r'ftps', r'git', r'gopher', r'https?', r'ircs?',
                 r'mms', r'nntp', r'redis', r'sftp', r'ssh', r'svn', r'telnet',
                 r'worldwind', r'xmpp']
# Remainder of the URL after the scheme part: one character that is not
# whitespace or one of '/$.?#', then any character, then a run of
# non-whitespace.
ADDRESS = r'[^\s/$.?#].[^\s]*'

# Full URL pattern: either "<plain scheme>:" or an optional "<scheme>:"
# followed by "//", and then the address. The closing parenthesis of this
# expression was missing in the original, which made the file unparsable.
url = (
    r'(' +  # noqa
        r'(' + '|'.join(PLAIN_PROTO) + r')\:|' +  # noqa
        r'((' + '|'.join(SLASHED_PROTO) + r')\:)?\/\/' +
    r')' + ADDRESS
)
# re.compile(url, re.U).match("")
# Matches Chinese, Japanese and Korean characters.
# Character class matching a single Chinese, Japanese or Korean character.
# NOTE(review): the original expression ended in a dangling '+' with neither
# the character class nor the parenthesis closed, so it could not be parsed.
# Both are closed here after the last range that was present; if further
# ranges followed in the source this was copied from, add them before the
# closing r']'.
cjk = (
    r'[' +
    r'\u4E00-\u62FF' +  # noqa Unified Ideographs
    r'\u6300-\u77FF' +
    r'\u7800-\u8CFF' +
    r'\u8D00-\u9FCC' +
    r'\u3400-\u4DFF' +  # Unified Ideographs Ext A
    r'\U00020000-\U000215FF' +  # Unified Ideographs Ext. B
    r'\U00021600-\U000230FF' +
    r'\U00023100-\U000245FF' +
    r'\U00024600-\U000260FF' +
    r'\U00026100-\U000275FF' +
    r'\U00027600-\U000290FF' +
    r'\U00029100-\U0002A6DF' +
    r'\uF900-\uFAFF' +  # Compatibility Ideographs
    r'\U0002F800-\U0002FA1F' +  # Compatibility Ideographs Suppl.
    r'\u3041-\u3096' +  # Hiragana
    r'\u30A0-\u30FF' +  # Katakana
    r'\u3400-\u4DB5' +  # Kanji
    r'\u4E00-\u9FCB' +
    r'\uF900-\uFA6A' +
    r'\u2E80-\u2FD5' +  # Kanji radicals
    r'\uFF5F-\uFF9F' +  # Katakana and Punctuation (Half Width)
    r'\u31F0-\u31FF' +  # Miscellaneous Japanese Symbols and Characters
    r'\u3220-\u3243' +
    r']'
)
# Extra character ranges (written for use inside a regex character class)
# that are accepted as word characters in addition to \w.
devangari_word = r'\u0901-\u0963'
# NOTE(review): the original ended this expression with a dangling `+ \`
# that ran into the next assignment and made the file unparsable. Only the
# two ranges that were present are kept; if a third range followed in the
# source this was copied from, add it here.
arabic_word = (
    r'\u0601-\u061A' +
    r'\u061C-\u0669'
)
bengali_word = r'\u0980-\u09FF'
combined_word = devangari_word + arabic_word + bengali_word

# A word: one non-digit word character (or a character from the extra
# ranges), then any number of word characters, then optional
# apostrophe-joined continuations. An apostrophe may also end the word when
# it is followed by whitespace or the end of the text.
word = (
    r'([^\W\d]|[' + combined_word + r'])' +
    r'[\w' + combined_word + r']*' +
    r'([\'’]([\w' + combined_word + r']+|(?=($|\s))))*'
)
# First lexicon variant: (token name, regex) pairs handed to RegexTokenizer.
# The entries were not bound to any name in the original (the
# `LEXICON_1 = [` opener and the closing `]` were missing), so
# `RT(LEXICON_1)` could not work. They are collected into the list here.
# NOTE(review): entry order is kept exactly as found, since RegexTokenizer
# presumably gives earlier entries priority -- confirm before reordering.
LEXICON_1 = [
    ('break', r'(\n|\n\r|\r\n)\s*(\n|\n\r|\r\n)+'),
    ('whitespace', r'(\n|\n\r|[^\S\n\r]+)'),
    ('comment_start', r'<!--'),
    ('comment_end', r'-->'),
    ("url", url),
    ('entity', r'&[a-z][a-z0-9]*;'),
    ('cjk', cjk),
    ('ref_open', r'<ref\b[^>/]*>'),
    ('ref_close', r'</ref\b[^>]*>'),
    ('ref_singleton', r'<ref\b[^>/]*/>'),
    ('tag', r'</?([a-z][a-z0-9]*)\b[^>]*>'),
    ('number', r'[\d]+'),
    ('japan_punct', r'[\u3000-\u303F]'),
    ('danda', r'।|॥'),
    ("bold", r"'''"),
    ("italic", r"''"),
    ('word', word),
    ('period', r'\.+'),
    ('qmark', r'\?+'),
    ('epoint', r'!+'),
    ('comma', r',+'),
    ('colon', r':+'),
    ('scolon', r';+'),
    ('dbrack_open', r'\[\['),
    ('dbrack_close', r'\]\]'),
    ('brack_open', r'\['),
    ('brack_close', r'\]'),
    ('paren_open', r'\('),
    ('paren_close', r'\)'),
    ('tab_open', r'\{\|'),
    ('tab_close', r'\|\}'),
    ('dcurly_open', r'\{\{'),
    ('dcurly_close', r'\}\}'),
    ('curly_open', r'\{'),
    ('curly_close', r'\}'),
    ("equals", r"=+"),
    ("bar", r"\|"),
    ("etc", r"."),
]

# Tokenizer built from the first lexicon variant.
tokenizer_1 = RT(LEXICON_1)
# URL pattern for the second lexicon variant. It uses non-capturing groups,
# requires an explicit scheme, and restricts the address tail to word
# characters, '.', '/' and ':'.
url2 = (
    r"("
    r"(?:bitcoin|geo|magnet|mailto|news|sips?|tel|urn):"
    r"|(?:https?|ftp|ftps|git|gopher|ircs?|mms|nntp|redis|sftp|ssh|svn"
    r"|telnet|worldwind|xmpp):\/\/"
    r")"
    r"[^\s\/$.?#][\w.\/:]+"
)
# Word pattern for the second lexicon variant: the same shape as `word`,
# written with non-capturing groups only.
word2 = ''.join((
    r'(?:[^\W\d]|[', combined_word, r'])',
    r'[\w', combined_word, r']*',
    r'(?:[\'’](?:[\w', combined_word, r']+|(?=(?:$|\s))))*',
))
# Second lexicon variant: (token name, regex) pairs handed to RegexTokenizer.
# Compared with the first variant it uses `url2`/`word2`, non-capturing
# groups in 'break' and 'whitespace', a 'ref_open' pattern that tolerates
# '/' not followed by '>', and a different entry order.
# The entries were not bound to any name in the original (the
# `LEXICON_2 = [` opener and the closing `]` were missing), so
# `RT(LEXICON_2)` could not work. They are collected into the list here.
# NOTE(review): entry order is kept exactly as found, since RegexTokenizer
# presumably gives earlier entries priority -- confirm before reordering.
LEXICON_2 = [
    ('break', r'(?:\n\r?|\r\n)\s*(?:\n\r?|\r\n)+'),
    ('whitespace', r'(?:\n\r?|[^\S\n\r]+)'),
    ("url", url2),
    ("equals", r"=+"),
    ("bar", r"\|"),
    ('entity', r'&[a-z][a-z0-9]*;'),
    ('ref_open', r'<ref\b(?:\/(?!>)|[^>/])*>'),
    ('ref_close', r'</ref\b[^>]*>'),
    ('ref_singleton', r'<ref\b[^>/]*/>'),
    ('tag', r'</?([a-z][a-z0-9]*)\b[^>]*>'),
    ('number', r'\d+'),
    ("bold", r"'''"),
    ("italic", r"''"),
    ('word', word2),
    ('tab_open', r'\{\|'),
    ('tab_close', r'\|\}'),
    ('dbrack_open', r'\[\['),
    ('dbrack_close', r'\]\]'),
    ('brack_open', r'\['),
    ('brack_close', r'\]'),
    ('paren_open', r'\('),
    ('paren_close', r'\)'),
    ('dcurly_open', r'\{\{'),
    ('dcurly_close', r'\}\}'),
    ('curly_open', r'\{'),
    ('curly_close', r'\}'),
    ('period', r'\.+'),
    ('qmark', r'\?+'),
    ('epoint', r'!+'),
    ('comma', r',+'),
    ('colon', r':+'),
    ('scolon', r';+'),
    ('comment_start', r'<!--'),
    ('comment_end', r'-->'),
    ('japan_punct', r'[\u3000-\u303F]'),
    ('danda', r'।|॥'),
    ('cjk', cjk),
    ("etc", r"."),
]

# Tokenizer built from the second lexicon variant.
tokenizer_2 = RT(LEXICON_2)
# Baseline: the existing wikitext_split tokenizer.
def wsplit(text):
    """Tokenize ``text`` with the ``wikitext_split`` tokenizer imported as ``ws``."""
    tokens = ws.tokenize(text)
    return tokens
# First regex modification.
def wsplit_lex_1(text):
    """Tokenize ``text`` with ``tokenizer_1`` (built from ``LEXICON_1``)."""
    tokens = tokenizer_1.tokenize(text)
    return tokens
# Second regex modification.
def wsplit_lex_2(text):
    """Tokenize ``text`` with ``tokenizer_2`` (built from ``LEXICON_2``)."""
    tokens = tokenizer_2.tokenize(text)
    return tokens
# Time 100 tokenizations of the article with the baseline tokenizer and
# report the rate per second. The loop body was missing in the original;
# the call is restored from the label in the message below.
# perf_counter() is used because it is the clock intended for measuring
# short durations.
start = time.perf_counter()
for _ in range(100):
    wsplit(text)
print("We can tokenize ", 1/((time.perf_counter() - start)/100), "Alan Turing's per second with wsplit")
# Time 100 tokenizations of the article with the first lexicon variant and
# report the rate per second. The loop body was missing in the original;
# the call is restored from the label in the message below.
# perf_counter() is used because it is the clock intended for measuring
# short durations.
start = time.perf_counter()
for _ in range(100):
    wsplit_lex_1(text)
print("We can tokenize ", 1/((time.perf_counter() - start)/100), "Alan Turing's per second with wsplit_lex_1")
# Time 100 tokenizations of the article with the second lexicon variant and
# report the rate per second. The loop body was missing in the original;
# the call is restored from the label in the message below.
# perf_counter() is used because it is the clock intended for measuring
# short durations.
start = time.perf_counter()
for _ in range(100):
    wsplit_lex_2(text)
print("We can tokenize ", 1/((time.perf_counter() - start)/100), "Alan Turing's per second with wsplit_lex_2")
# Results of the benchmark above:
# We can tokenize 5.012405581440883 Alan Turing's per second with wsplit
# We can tokenize 5.756141226473975 Alan Turing's per second with wsplit_lex_1
# We can tokenize 7.274706788039202 Alan Turing's per second with wsplit_lex_2
