Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save piegu/d320edd105537b53a95cae31a266998f to your computer and use it in GitHub Desktop.
Save piegu/d320edd105537b53a95cae31a266998f to your computer and use it in GitHub Desktop.
en tokenizer on a text in 3 languages of Byte-Level-BPE_universal_tokenizer_but.ipynb
# English pre-trained tokenizer on a text in 3 languages (en, pt, fr)

# text in 3 languages to be tokenized
text_en = 'Jacques-Germain Soufflot (Irancy, July 22, 1713 - Paris, August 29, 1780) was a French architect, initiator of the architectural style of Neoclassicism.'
text_pt = 'Jacques-Germain Soufflot (Irancy, 22 de julho de 1713 — Paris, 29 de agosto de 1780) foi um arquitecto francês, iniciador do estilo arquitectónico do Neoclassicismo.'
text_fr = 'Jacques-Germain Soufflot (Irancy, 22 juillet 1713 - Paris, 29 août 1780) était un architecte français, initiateur du style architectural du néoclassicisme.'

# Language codes, listed in the same order as `texts` so that both lists can
# be walked in lockstep.
langs = ['en', 'pt', 'fr']
texts = [text_en, text_pt, text_fr]

# Print every raw text prefixed by its language code.
# NOTE(review): `TitledStr` is not defined in this part of the file; it is
# assumed to be provided by an earlier import - confirm.
for lang, text in zip(langs, texts):
    print(f'({lang}) {TitledStr(text)}\n')
# number and list of classical tokens (ie, tokens separated by whitespace)
for lang, text in zip(langs, texts):
    # Split once and reuse the result, so the reported count and the displayed
    # list can never disagree (e.g. when a text contains consecutive blanks).
    words = text.split()
    print(f'({lang} - {len(words)} tokens) {TitledStr(words)}\n')
# number and list of tokens
# after the text tokenization by imported BPE GPT2TokenizerFast (trained with an English corpus...)
# NOTE(review): `tokenizer_en` is created outside this part of the file.
for lang, text in zip(langs, texts):
    toks = tokenizer_en.tokenize(text)
    print(f'({lang} - {len(toks)} tokens) {TitledStr(toks)}\n')
# number and list of tokens ids
# after the text tokenization + numerization by imported BPE GPT2TokenizerFast (trained with an English corpus...)
# NOTE(review): `tokenizer_en` is created outside this part of the file.
for lang, text in zip(langs, texts):
    toks_ids = tokenizer_en.encode(text)
    print(f'({lang} - {len(toks_ids)} tokens) {TitledStr(toks_ids)}\n')
# decode (back to the text)
# Each text is encoded to ids and immediately decoded again, so the printed
# string shows what the tokenizer gives back after a full round trip.
for lang, text in zip(langs, texts):
    toks_ids = tokenizer_en.encode(text)
    text_decoded = tokenizer_en.decode(toks_ids)
    print(f'({lang}) {TitledStr(text_decoded)}\n')
# graph
# Grouped bar chart: for every language, the number of whitespace-separated
# words next to the number of token ids produced by the English tokenizer.
# source: (the reference URL is missing from this copy of the notebook)

# Per-language counts, filled in the same order as `texts`.
text_split = list()
toks_split = list()
for text in texts:
    text_split.append(len(text.split()))
    toks_ids = tokenizer_en.encode(text)
    toks_split.append(len(toks_ids))

labels = langs

# One group per language, centred on 1, 2, 3, ...; the two bars of a group
# are shifted by -0.2 / +0.2 so that they sit side by side.
positions = np.arange(1, len(labels) + 1)
xy = list(positions - 0.2)
xz = list(positions + 0.2)
y = text_split
z = toks_split

ax = plt.subplot(111)
ax.bar(xy, y, width=0.4, color='b', align='center')
ax.bar(xz, z, width=0.4, color='g', align='center')

# NOTE(review): `labels` was assigned but never used in the code as found;
# it is assumed to be meant as the x tick labels - confirm.
ax.set_xticks(positions)
ax.set_xticklabels(labels)

ax.set_ylabel('number of tokens')
ax.legend(['split(" ")', 'GPT2TokenizerFast (en)'])
ax.set_title('Number of tokens by tokenization method and lang')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment