Skip to content

Instantly share code, notes, and snippets.

@soldni
Last active October 26, 2022 17:52
Show Gist options
  • Save soldni/90bbb3307133e9fafc9a73709c3318c2 to your computer and use it in GitHub Desktop.
Save soldni/90bbb3307133e9fafc9a73709c3318c2 to your computer and use it in GitHub Desktop.
import subprocess
import sys
# Bootstrap: install the third-party packages this script imports, using pip
# from the same interpreter that is running the script. check_call raises
# subprocess.CalledProcessError if pip exits with a non-zero status.
_pip_packages = ("spacy", "blingfire", "tokenizers", "lorem")
subprocess.check_call([sys.executable, "-m", "pip", "install", *_pip_packages])
import timeit
import lorem
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from blingfire import text_to_words
import tokenizers
# spaCy: a Tokenizer built directly from the vocabulary of a freshly
# constructed English language object.
# NOTE(review): no tokenization rules are passed to Tokenizer here, so this is
# presumably not equivalent to `nlp.tokenizer` — confirm that is intended for
# the comparison.
nlp = English()
spacy_tok = Tokenizer(nlp.vocab)

# Hugging Face tokenizers: the BERT pre-tokenizer, used on its own.
hf_tok = tokenizers.pre_tokenizers.BertPreTokenizer()
def tokenize_spacy(text):
    """Tokenize ``text`` with the module-level spaCy tokenizer.

    Returns a list holding the ``.text`` attribute of every token that
    ``spacy_tok`` yields for ``text``.
    """
    doc = spacy_tok(text)
    return [token.text for token in doc]
def tokenize_blingfire(text):
    """Tokenize ``text`` with BlingFire's ``text_to_words``.

    Returns the list obtained by splitting the ``text_to_words`` result on
    whitespace.
    """
    joined = text_to_words(text)
    # split() with no arguments splits on runs of whitespace and drops
    # empty strings.
    return joined.split()
def tokenize_hf(text):
    """Tokenize ``text`` with the module-level Hugging Face pre-tokenizer.

    ``pre_tokenize_str`` yields two-element items; only the first element of
    each (the token string) is kept in the returned list.
    """
    pieces = hf_tok.pre_tokenize_str(text)
    return [piece for piece, _offsets in pieces]
# Number of times each tokenizer is run over the sample text.
TIMES = 10_000

# Sample input: text generated by the `lorem` package.
p = lorem.text()
print(f"{len(p):,} chars, {TIMES:,} tries")


def _time_calls(fn, text, times):
    """Return the seconds elapsed while calling ``fn(text)`` ``times`` times.

    Elapsed time is the difference between two ``timeit.default_timer()``
    readings taken immediately before and after the loop. Return values of
    ``fn`` are discarded.
    """
    start_time = timeit.default_timer()
    for _ in range(times):
        fn(text)
    return timeit.default_timer() - start_time


# One shared timing routine for all tokenizers, run in a fixed order.
# Each label keeps its trailing space; print() then adds its own separator
# before the elapsed-seconds value.
for _label, _tokenize in (
    ("Spacy: ", tokenize_spacy),
    ("Blingfire: ", tokenize_blingfire),
    ("Huggingface: ", tokenize_hf),
):
    print(_label, _time_calls(_tokenize, p, TIMES))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment