Created
June 11, 2021 17:24
-
-
Save jamescalam/10f8828bc6a85c403fa12c2a9f1416a1 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# test our tokenizer on a simple sentence\n", | |
"tokens = tokenizer.encode('ciao, come va?') # 'hi, how are you?'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 6 | |
} | |
], | |
"source": [ | |
"tokens # this is our Encoding object, with several attributes including ids and attention_mask" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"['[CLS]', 'ciao', ',', 'Ġcome', 'Ġva', '?', '[SEP]', '[PAD]', '[PAD]', '[PAD]']" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 7 | |
} | |
], | |
"source": [ | |
"tokens.tokens[:10] # we can view the tokens here (e.g. output of merges.txt)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"[1, 16834, 16, 488, 611, 35, 2, 0, 0, 0]" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 8 | |
} | |
], | |
"source": [ | |
"tokens.ids[:10] # and here are the token ids (output of vocab.json)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "ML", | |
"language": "python", | |
"name": "ml" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.8.5" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment