# Create an Elasticsearch index whose analyzer uses a pattern tokenizer
# approximating the token types of deltas' wikitext_split
import json

import requests
data = r"""{
"settings": {
"index.analyze.max_token_count" : 1000000,
"analysis": {
"analyzer": {
"my_analyzer": {
"tokenizer": "my_tokenizer"
}
},
"tokenizer": {
"my_tokenizer": {
"type": "pattern",
"pattern": "(?<commentstart><!--)|(?<commentend>-->)|(?<url>((bitcoin|geo|magnet|mailto|news|sips?|tel|urn)\\:|((|ftp|ftps|git|gopher|https?|ircs?|mms|nntp|redis|sftp|ssh|svn|telnet|worldwind|xmpp)\\:)?\/\/)[^\\s/$.?#].[^\\s]*)|(?<entity>&[a-z][a-z0-9]*;)|(?<cjk>[\u4E00-\u62FF\u6300-\u77FF\u7800-\u8CFF\u8D00-\u9FCC\u3400-\u4DFF-\ud845\uddff-\ud845\ude00-\ud84c\udcff\ud84c\udd00-\ud851\uddff\ud851\ude00-\ud858\udcff\ud858\udd00-\ud85d\uddff\ud85d\ude00-\ud864\udcff\ud864\udd00-\ud869\udedf\uF900-\uFAFF\ud87e\udc00-\ud87e\ude1f\u3041-\u3096\u30A0-\u30FF\u3400-\u4DB5\u4E00-\u9FCB\uF900-\uFA6A\u2E80-\u2FD5\uFF5F-\uFF9F\u31F0-\u31FF\u3220-\u3243\u3280-\u337F])|(?<refopen><ref\\b[^>/]*>)|(?<refsingleton><ref\\b[^>/]*/>)|(?<tag></?([a-z][a-z0-9]*)\\b[^>]*>)|(?<number>[\\d]+)|(?<japanpunct>[\u3000-\u303F])|(?<danda>।|॥)|(?<bold>''')|(?<italic>'')|(?<word>([^\\W\\d]|[\u0901-\u0963\u0601-\u061A\u061C-\u0669\u06D5-\u06EF\u0980-\u09FF])[\\w\u0901-\u0963\u0601-\u061A\u061C-\u0669\u06D5-\u06EF\u0980-\u09FF]*([\\'’]([\\w\u0901-\u0963\u0601-\u061A\u061C-\u0669\u06D5-\u06EF\u0980-\u09FF]+|(?=($|\\s))))*)|(?<period>\\.+)|(?<qmark>\\?+)|(?<epoint>!+)|(?<comma>,+)|(?<colon>:+)|(?<scolon>;+)|(?<break>(\\n|\\n\\r|\\r\\n)\\s*(\\n|\\n\\r|\\r\\n)+)|(?<whitespace>(\\n|\\n\\r|[^\\S\\n\\r]+))|(?<dbrackopen>\\[\\[)|(?<dbrackclose>\\]\\])|(?<brackopen>\\[)|(?<brackclose>\\])|(?<parenopen>\\()|(?<parenclose>\\))|(?<tabopen>\\{\\|)|(?<tabclose>\\|\\})|(?<dcurlyopen>\\{\\{)|(?<dcurlyclose>\\}\\})|(?<curlyopen>\\{)|(?<curlyclose>\\})|(?<equals>=+)|(?<bar>\\|)|(?<etc>.)",
"group": 0
}
}
}
}
}""".encode('utf-8')
resp = requests.put('http://localhost:9200/my_index', data=data, headers={'content-type': 'application/json'})
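# Sanity check (a sketch, not part of the original gist): confirm the index was
# created before benchmarking, and exercise the analyzer on a small sample.
# The 1000000 max_token_count setting above raises the default _analyze cap
# (10000 tokens) so a full article fits in one request.
assert resp.status_code == 200, resp.text
sample = json.dumps({"analyzer": "my_analyzer", "text": "''Alan Turing'' was a [[mathematician]]."})
check = requests.post('http://localhost:9200/my_index/_analyze',
                      data=sample, headers={'content-type': 'application/json'})
print([t['token'] for t in json.loads(check.text)['tokens']])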
# Fetch the text of the "Alan Turing" article to use in the benchmark
import time
import mwapi
session = mwapi.Session("https://en.wikipedia.org")
doc = session.get(action='query', prop='revisions', rvprop='content', titles='Alan Turing', formatversion=2)
text = doc['query']['pages'][0]['revisions'][0]['content']
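# Note (an assumption about newer MediaWiki API versions, not from the original
# gist): bare rvprop='content' is deprecated in favour of revision slots, in
# which case the request and lookup become:
# doc = session.get(action='query', prop='revisions', rvprop='content',
#                   rvslots='main', titles='Alan Turing', formatversion=2)
# text = doc['query']['pages'][0]['revisions'][0]['slots']['main']['content']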
# Tokenization helpers: deltas' wikitext_split vs. Elasticsearch's _analyze endpoint
from deltas.tokenizers import wikitext_split as ws
def elast(text):
    data = json.dumps({"analyzer": "my_analyzer", "text": text})
    resp = requests.post('http://localhost:9200/my_index/_analyze',
                         data=data, headers={'content-type': 'application/json'})
    return json.loads(resp.text)

def wsplit(text):
    return list(ws.tokenize(text))
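# Quick equivalence check (a sketch, not in the original gist): both helpers
# should emit comparable token streams on a small input. elast() returns the
# raw _analyze response, so the 'token' field is pulled out for comparison.
sample = "Alan Turing was a mathematician."
print(wsplit(sample)[:5])
print([t['token'] for t in elast(sample)['tokens']][:5])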
# Benchmark: average over 100 runs of tokenizing the full article
start = time.time()
for i in range(100):
    wsplit(text)
print("We can tokenize", 1 / ((time.time() - start) / 100), "Alan Turing articles per second with wsplit")

start = time.time()
for i in range(100):
    elast(text)
print("We can tokenize", 1 / ((time.time() - start) / 100), "Alan Turing articles per second with elastic")
# Results:
# We can tokenize 5.827891467459811 Alan Turing articles per second with wsplit
# We can tokenize 3.128938281450994 Alan Turing articles per second with elastic
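# A possible follow-up (untested assumption, not measured in the original
# gist): part of the elastic gap is per-request HTTP overhead, so reusing one
# connection by calling requests.Session inside elast() may narrow it, e.g.:
# es = requests.Session()
# resp = es.post('http://localhost:9200/my_index/_analyze', data=data,
#                headers={'content-type': 'application/json'})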