English vs Portuguese tokenizer on Portuguese Wikipedia (from Byte-Level-BPE_universal_tokenizer_but_en_tokenizer.ipynb)
# English vs Portuguese tokenizer on Portuguese Wikipedia
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

lang = 'pt'
fname = f'all_texts_{lang}wiki.csv'
df = pd.read_csv(path_data/fname)  # path_data: pathlib.Path to the data folder, defined earlier in the notebook
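The cells below assume `tokenizer_en` and `tokenizer_pt` already exist (they were trained earlier in the notebook). A minimal sketch of loading two pre-trained Byte-Level BPE tokenizers with the Hugging Face tokenizers library could look like this; the file names are hypothetical:

from tokenizers import ByteLevelBPETokenizer

# hypothetical paths: the vocab/merges files written when each tokenizer was trained
tokenizer_en = ByteLevelBPETokenizer('en_vocab.json', 'en_merges.txt')
tokenizer_pt = ByteLevelBPETokenizer('pt_vocab.json', 'pt_merges.txt')

# note: with this library, encode(text) returns an Encoding object; its .ids (or
# .tokens) attribute is the token list. The loop below assumes encode() returns a
# list-like object whose len() is the number of subword tokens.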
df2 = df.copy()
tokens_en_list = []
num_token_by_word_en_list = []
tokens_pt_list = []
num_token_by_word_pt_list = []
for index, row in df2.iterrows():
    text = row['text']
    # tokenize the same Portuguese text with both tokenizers
    tokens_en = tokenizer_en.encode(text)
    tokens_pt = tokenizer_pt.encode(text)
    tokens_en_list.append(tokens_en)
    tokens_pt_list.append(tokens_pt)
    # tokens per word, with words counted by whitespace splitting
    # (assumes every text is non-empty)
    length_text = len(text.split())
    tokens_by_word_en = len(tokens_en) / length_text
    tokens_by_word_pt = len(tokens_pt) / length_text
    num_token_by_word_en_list.append(tokens_by_word_en)
    num_token_by_word_pt_list.append(tokens_by_word_pt)
df2['tokens_en'] = tokens_en_list
df2['num_token_by_word_en'] = num_token_by_word_en_list
df2['tokens_pt'] = tokens_pt_list
df2['num_token_by_word_pt'] = num_token_by_word_pt_list
# check min
num_token_by_word_en_min = df2.num_token_by_word_en.min()
num_token_by_word_pt_min = df2.num_token_by_word_pt.min()
print('(en)',round(num_token_by_word_en_min,2))
print('(pt)',round(num_token_by_word_pt_min,2))
# check max
num_token_by_word_en_max = df2.num_token_by_word_en.max()
num_token_by_word_pt_max = df2.num_token_by_word_pt.max()
print('(en)',round(num_token_by_word_en_max,2))
print('(pt)',round(num_token_by_word_pt_max,2))
# check mean
num_token_by_word_en_mean = df2.num_token_by_word_en.mean()
num_token_by_word_pt_mean = df2.num_token_by_word_pt.mean()
print('(en)',round(num_token_by_word_en_mean,2))
print('(pt)',round(num_token_by_word_pt_mean,2))
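The three min/max/mean blocks above can also be collapsed into a single pandas aggregation; an equivalent one-liner, not what the notebook runs:

stats = df2[['num_token_by_word_en', 'num_token_by_word_pt']].agg(['min', 'max', 'mean'])
print(stats.round(2))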
# check the rate of increase and the multiplier coefficient
increase = 0.
multiplier = 0.
for tok_en, tok_pt in zip(tokens_en_list, tokens_pt_list):
    increase += (len(tok_en) - len(tok_pt)) / len(tok_pt)
    multiplier += len(tok_en) / len(tok_pt)
# average rate of increase from pt to en, as a fraction of the pt token count
increase_pct = increase / len(tokens_en_list)
print('Rate of increase:', round(increase_pct * 100, 2), '%')
# multiplier coefficient = average rate of increase (as a fraction) + 1
multiplier_coef = round(increase_pct + 1, 2)
print('Multiplier coefficient:', multiplier_coef)
# the same coefficient computed directly as the mean of len(en)/len(pt), in %
multiplier_pct = round((multiplier / len(tokens_en_list)) * 100, 2)
print('Multiplier coefficient in %:', multiplier_pct, '%')
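The two coefficients agree by construction: for every text, len(en)/len(pt) = 1 + (len(en) - len(pt))/len(pt), so the mean ratio equals the mean rate of increase plus one. A quick check with hypothetical token counts for a single text:

# hypothetical counts: 120 en tokens vs 100 pt tokens for the same text
len_en, len_pt = 120, 100
increase_one = (len_en - len_pt) / len_pt   # 0.20, i.e. +20%
multiplier_one = len_en / len_pt            # 1.20
assert abs(multiplier_one - (increase_one + 1)) < 1e-12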
# graph
len_tokens_text_list = []
for index, row in df2.iterrows():
    text = row['text']
    length_text = len(text.split())
    len_tokens_text_list.append(length_text)
tokens_en_list = df2.tokens_en.tolist()
len_tokens_en_list = [len(t) for t in tokens_en_list]
tokens_pt_list = df2.tokens_pt.tolist()
len_tokens_pt_list = [len(t) for t in tokens_pt_list]
# reference lines with hard-coded slopes: ~12 tokens/word (en) and ~7 tokens/word (pt)
sorted_len_tokens_text_list = sorted(len_tokens_text_list)
y_len_tokens_en_list = (12 * np.array(sorted_len_tokens_text_list)).tolist()
y_len_tokens_pt_list = (7 * np.array(sorted_len_tokens_text_list)).tolist()
ax = plt.subplot(111)
# label each scatter directly so the legend maps to the right series
# (a bare ax.legend(['en', 'pt']) would label the first two artists instead)
ax.scatter(len_tokens_text_list, len_tokens_en_list, label='en')
ax.plot(sorted_len_tokens_text_list, y_len_tokens_en_list)
ax.scatter(len_tokens_text_list, len_tokens_pt_list, label='pt')
ax.plot(sorted_len_tokens_text_list, y_len_tokens_pt_list)
ax.set_xlabel('length of texts (words)')
ax.set_ylabel('number of en and pt tokens')
ax.legend()
ax.set_title('Number of tokens by tokenization method')
plt.show()
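Instead of the hard-coded slopes 12 and 7, the per-language slopes could be fitted from the data, for example with a least-squares line through the origin; a sketch, not part of the original notebook:

# fit y = slope * x through the origin for each tokenizer: slope = sum(x*y) / sum(x*x)
x = np.array(len_tokens_text_list, dtype=float)
slope_en = (x @ np.array(len_tokens_en_list, dtype=float)) / (x @ x)
slope_pt = (x @ np.array(len_tokens_pt_list, dtype=float)) / (x @ x)
print('fitted tokens/word:', round(slope_en, 2), '(en),', round(slope_pt, 2), '(pt)')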