# Creates an Elasticsearch index whose analyzer wraps a custom pattern tokenizer
import requests

param = (('v', ''),)  # note: defined but never passed to the request below
data = r"""{
    "settings": {
        "index.analyze.max_token_count": 1000000,
        "analysis": {
            "analyzer": {
                "my_analyzer": {
                    "tokenizer": "my_tokenizer"
                }
            },
            "tokenizer": {
                "my_tokenizer": {
                    "type": "pattern",
                    "pattern": "(?<commentstart><!--)|(?<commentend>-->)|(?<url>((bitcoin|geo|magnet|mailto|news|sips?|tel|urn)\\:|((|ftp|ftps|git|gopher|https?|ircs?|mms|nntp|redis|sftp|ssh|svn|telnet|worldwind|xmpp)\\:)?\/\/)[^\\s/$.?#].[^\\s]*)|(?<entity>&[a-z][a-z0-9]*;)|(?<cjk>[\u4E00-\u62FF\u6300-\u77FF\u7800-\u8CFF\u8D00-\u9FCC\u3400-\u4DFF-\ud845\uddff-\ud845\ude00-\ud84c\udcff\ud84c\udd00-\ud851\uddff\ud851\ude00-\ud858\udcff\ud858\udd00-\ud85d\uddff\ud85d\ude00-\ud864\udcff\ud864\udd00-\ud869\udedf\uF900-\uFAFF\ud87e\udc00-\ud87e\ude1f\u3041-\u3096\u30A0-\u30FF\u3400-\u4DB5\u4E00-\u9FCB\uF900-\uFA6A\u2E80-\u2FD5\uFF5F-\uFF9F\u31F0-\u31FF\u3220-\u3243\u3280-\u337F])|(?<refopen><ref\\b[^>/]*>)|(?<refsingleton><ref\\b[^>/]*/>)|(?<tag></?([a-z][a-z0-9]*)\\b[^>]*>)|(?<number>[\\d]+)|(?<japanpunct>[\u3000-\u303F])|(?<danda>।|॥)|(?<bold>''')|(?<italic>'')|(?<word>([^\\W\\d]|[\u0901-\u0963\u0601-\u061A\u061C-\u0669\u06D5-\u06EF\u0980-\u09FF])[\\w\u0901-\u0963\u0601-\u061A\u061C-\u0669\u06D5-\u06EF\u0980-\u09FF]*([\\'’]([\\w\u0901-\u0963\u0601-\u061A\u061C-\u0669\u06D5-\u06EF\u0980-\u09FF]+|(?=($|\\s))))*)|(?<period>\\.+)|(?<qmark>\\?+)|(?<epoint>!+)|(?<comma>,+)|(?<colon>:+)|(?<scolon>;+)|(?<break>(\\n|\\n\\r|\\r\\n)\\s*(\\n|\\n\\r|\\r\\n)+)|(?<whitespace>(\\n|\\n\\r|[^\\S\\n\\r]+))|(?<dbrackopen>\\[\\[)|(?<dbrackclose>\\]\\])|(?<brackopen>\\[)|(?<brackclose>\\])|(?<parenopen>\\()|(?<parenclose>\\))|(?<tabopen>\\{\\|)|(?<tabclose>\\|\\})|(?<dcurlyopen>\\{\\{)|(?<dcurlyclose>\\}\\})|(?<curlyopen>\\{)|(?<curlyclose>\\})|(?<equals>=+)|(?<bar>\\|)|(?<etc>.)",
                    "group": 0
                }
            }
        }
    }
}""".encode('utf-8')
resp = requests.put('http://localhost:9200/my_index', data=data, headers={'content-type': 'application/json'})
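
# Quick sanity check (a sketch, not part of the original benchmark; assumes the
# Elasticsearch instance at localhost:9200 is reachable and returned its usual
# index-creation body with an "acknowledged" flag):
resp.raise_for_status()
assert resp.json().get('acknowledged'), resp.text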

# Fetches the article text used for the performance test
import time
import mwapi

session = mwapi.Session("https://en.wikipedia.org")
doc = session.get(action='query', prop='revisions', rvprop='content', titles='Alan Turing', formatversion=2)
text = doc['query']['pages'][0]['revisions'][0]['content']
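
# Sketch: confirm the fetch worked; the exact length varies with whatever
# revision of the page is live at the time.
print(f"Fetched {len(text):,} characters of 'Alan Turing' wikitext")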

# Functions for tokenization
import json
from deltas.tokenizers import wikitext_split as ws

def elast(text):
    """Tokenizes text remotely via Elasticsearch's _analyze API."""
    data = json.dumps({"analyzer": "my_analyzer", "text": text})
    resp = requests.post('http://localhost:9200/my_index/_analyze', data=data, headers={'content-type': 'application/json'})
    return json.loads(resp.text)

def wsplit(text):
    """Tokenizes text in-process with deltas' wikitext_split tokenizer."""
    return list(ws.tokenize(text))
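
# Spot check on a short sample (a sketch; not part of the original timing run).
# elast() returns Elasticsearch's _analyze response, where each entry in
# "tokens" carries the matched text under the "token" key; deltas tokens
# render back to plain strings with str().
sample = "Alan Turing was a [[mathematician]]."
print([t['token'] for t in elast(sample)['tokens']])
print([str(t) for t in wsplit(sample)])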

start = time.time()
for i in range(100):
    wsplit(text)
print("We can tokenize", 100 / (time.time() - start), "'Alan Turing' articles per second with wsplit")

start = time.time()
for i in range(100):
    elast(text)
print("We can tokenize", 100 / (time.time() - start), "'Alan Turing' articles per second with elastic")

# Results:
# We can tokenize 5.827891467459811 'Alan Turing' articles per second with wsplit
# We can tokenize 3.128938281450994 'Alan Turing' articles per second with elastic
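
# For steadier numbers, timeit could replace the hand-rolled loops above
# (a sketch; results will vary with hardware and Elasticsearch load):
import timeit
per_call = min(timeit.repeat(lambda: wsplit(text), number=10, repeat=3)) / 10
print(f"Best-of-3 wsplit time per article: {per_call:.3f}s")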