English tokenizer on a text in 3 languages (Byte-Level-BPE_universal_tokenizer_but.ipynb)
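# setup (a minimal sketch, not part of the gist cells below): the imports and objects the code assumes.
# tokenizer_en is assumed to be a Hugging Face GPT2TokenizerFast loaded from the English 'gpt2' checkpoint,
# TitledStr is assumed to be fastai's pretty-printing string class, and np/plt are numpy/matplotlib.
import numpy as np
import matplotlib.pyplot as plt
from transformers import GPT2TokenizerFast
from fastai.torch_core import TitledStr

tokenizer_en = GPT2TokenizerFast.from_pretrained('gpt2')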
# English pre-trained tokenizer on a text in 3 languages (en, pt, fr)
# text in 3 languages to be tokenized
text_en = 'Jacques-Germain Soufflot (Irancy, July 22, 1713 - Paris, August 29, 1780) was a French architect, initiator of the architectural style of Neoclassicism.'
text_pt = 'Jacques-Germain Soufflot (Irancy, 22 de julho de 1713 — Paris, 29 de agosto de 1780) foi um arquitecto francês, iniciador do estilo arquitectónico do Neoclassicismo.'
text_fr = 'Jacques-Germain Soufflot (Irancy, 22 juillet 1713 - Paris, 29 août 1780) était un architecte français, initiateur du style architectural du néoclassicisme.'
langs = ['en', 'pt', 'fr']
texts = [text_en,text_pt,text_fr]
for lang, text in zip(langs, texts):
    print(f'({lang}) {TitledStr(text)}\n')
# number and list of classical tokens (i.e., tokens separated by a blank space)
for lang, text in zip(langs, texts):
    print(f'({lang} - {len(text.split())} tokens) {TitledStr(text.split(" "))}\n')
# number and list of tokens
# after tokenization of the text by the imported byte-level BPE GPT2TokenizerFast (trained on an English corpus...)
for lang, text in zip(langs, texts):
    toks = tokenizer_en.tokenize(text)
    print(f'({lang} - {len(toks)} tokens) {TitledStr(toks)}\n')
# number and list of tokens ids
# after tokenization + numericalization of the text by the imported byte-level BPE GPT2TokenizerFast (trained on an English corpus...)
for lang, text in zip(langs, texts):
    toks_ids = tokenizer_en.encode(text)
    print(f'({lang} - {len(toks_ids)} tokens) {TitledStr(toks_ids)}\n')
# decode (back to the text)
for lang, text in zip(langs, texts):
    toks_ids = tokenizer_en.encode(text)
    text_decoded = tokenizer_en.decode(toks_ids)
    print(f'({lang}) {TitledStr(text_decoded)}\n')
# graph
# source: https://matplotlib.org/3.2.1/gallery/lines_bars_and_markers/barchart.html#sphx-glr-gallery-lines-bars-and-markers-barchart-py
text_split = []
toks_split = []
for text in texts:
    text_split.append(len(text.split()))
    toks_ids = tokenizer_en.encode(text)
    toks_split.append(len(toks_ids))
labels = langs
xy = list(np.array([1.,2.,3.]) - 0.2)
xz = list(np.array([1.,2.,3.]) + 0.2)
y = text_split
z = toks_split
ax = plt.subplot(111)
ax.bar(xy, y, width=0.4, color='b', align='center')
ax.bar(xz, z, width=0.4, color='g', align='center')
ax.set_xlabel('languages')
ax.set_xticks(range(1,len(labels)+1))
ax.set_xticklabels(labels)
ax.set_ylabel('number of tokens')
ax.legend(['split(" ")', 'GPTTokenizerFast (en)'])
ax.set_title('Number of tokens by tokenization method and lang')
plt.show()
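# follow-up sketch (not in the original gist): ratio of GPT-2 tokens to whitespace-separated words per language,
# a quick way to quantify how much the English-trained tokenizer over-segments the Portuguese and French texts
for lang, text in zip(langs, texts):
    ratio = len(tokenizer_en.encode(text)) / len(text.split())
    print(f'({lang}) {ratio:.2f} tokens per word')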