Broken Tokenizer
By @marrrcin, forked from LysandreJik/broken-tokenizer.ipynb. Created March 1, 2021 09:29.
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Broken Tokenizer",
"provenance": [],
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/LysandreJik/04c7cfe3d2656ae1c4c388ce9cdd3ea4/broken-tokenizer.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "79DQ3WqLT1rz",
"outputId": "528af80d-f6f6-4e1c-8f62-2cfda6f054ef"
},
"source": [
"!pip install tokenizers==0.10.1 transformers==4.3.3"
],
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"text": [
"Collecting tokenizers==0.10.1\n",
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/71/23/2ddc317b2121117bf34dd00f5b0de194158f2a44ee2bf5e47c7166878a97/tokenizers-0.10.1-cp37-cp37m-manylinux2010_x86_64.whl (3.2MB)\n",
"\u001b[K |████████████████████████████████| 3.2MB 6.7MB/s \n",
"\u001b[?25hCollecting transformers==4.3.3\n",
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/f9/54/5ca07ec9569d2f232f3166de5457b63943882f7950ddfcc887732fc7fb23/transformers-4.3.3-py3-none-any.whl (1.9MB)\n",
"\u001b[K |████████████████████████████████| 1.9MB 34.5MB/s \n",
"\u001b[?25hRequirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from transformers==4.3.3) (20.9)\n",
"Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers==4.3.3) (1.19.5)\n",
"Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers==4.3.3) (3.0.12)\n",
"Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from transformers==4.3.3) (2.23.0)\n",
"Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers==4.3.3) (4.41.1)\n",
"Collecting sacremoses\n",
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)\n",
"\u001b[K |████████████████████████████████| 890kB 35.6MB/s \n",
"\u001b[?25hRequirement already satisfied: importlib-metadata; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from transformers==4.3.3) (3.4.0)\n",
"Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers==4.3.3) (2019.12.20)\n",
"Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->transformers==4.3.3) (2.4.7)\n",
"Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.3.3) (3.0.4)\n",
"Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.3.3) (2.10)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.3.3) (2020.12.5)\n",
"Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.3.3) (1.24.3)\n",
"Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers==4.3.3) (1.15.0)\n",
"Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers==4.3.3) (7.1.2)\n",
"Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers==4.3.3) (1.0.1)\n",
"Requirement already satisfied: typing-extensions>=3.6.4; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from importlib-metadata; python_version < \"3.8\"->transformers==4.3.3) (3.7.4.3)\n",
"Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata; python_version < \"3.8\"->transformers==4.3.3) (3.4.0)\n",
"Building wheels for collected packages: sacremoses\n",
" Building wheel for sacremoses (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for sacremoses: filename=sacremoses-0.0.43-cp37-none-any.whl size=893262 sha256=cd09dc6605e1a6a1c26668f0d347aad64598e3f4334867237d6ad1de8b936295\n",
" Stored in directory: /root/.cache/pip/wheels/29/3c/fd/7ce5c3f0666dab31a50123635e6fb5e19ceb42ce38d4e58f45\n",
"Successfully built sacremoses\n",
"Installing collected packages: tokenizers, sacremoses, transformers\n",
"Successfully installed sacremoses-0.0.43 tokenizers-0.10.1 transformers-4.3.3\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "nbrq0DMFT5s8",
"outputId": "b742df55-cfd8-4b3e-97ae-5474ad42d6c3"
},
"source": [
"!wget https://www.gutenberg.org/files/1112/1112.txt -O data.txt"
],
"execution_count": 2,
"outputs": [
{
"output_type": "stream",
"text": [
"--2021-02-25 20:32:20-- https://www.gutenberg.org/files/1112/1112.txt\n",
"Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47\n",
"Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 179410 (175K) [text/plain]\n",
"Saving to: ‘data.txt’\n",
"\n",
"data.txt 100%[===================>] 175.21K 882KB/s in 0.2s \n",
"\n",
"2021-02-25 20:32:21 (882 KB/s) - ‘data.txt’ saved [179410/179410]\n",
"\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "Ach1EHSuUDXA"
},
"source": [
"from tokenizers import ByteLevelBPETokenizer\n",
"from tokenizers.processors import RobertaProcessing"
],
"execution_count": 12,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "VyMUQYK8UhQY"
},
"source": [
"from pathlib import Path\n",
"import shutil\n",
"workdir = Path(\"./workdir\")\n",
"workdir.mkdir(exist_ok=True)"
],
"execution_count": 29,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "_z4fwwY7UEq5",
"outputId": "d1c6bdb1-fe18-42c4-e5e4-0f1e4cb29861"
},
"source": [
" tokenizer = ByteLevelBPETokenizer(lowercase=False)\n",
" tokenizer.post_processor = RobertaProcessing(sep=(\"<s>\", 2), cls=(\"</s>\", 0))\n",
" tokenizer.train(\n",
" [\"./data.txt\"],\n",
" vocab_size=1234,\n",
" special_tokens=[\"<s>\", \"<pad>\", \"</s>\", \"<unk>\", \"<mask>\"],\n",
" min_frequency=5,\n",
" )\n",
" tokenizer_output_path = workdir / \"tokenizer\"\n",
" shutil.rmtree(tokenizer_output_path)\n",
" tokenizer_output_path.mkdir(exist_ok=False)\n",
" tokenizer_json = tokenizer_output_path / \"tokenizer.json\"\n",
" tokenizer.save(str(tokenizer_json.absolute()), True)\n",
" tokenizer.save_model(str(tokenizer_output_path.absolute()))\n",
"\n"
],
"execution_count": 30,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"['/content/workdir/tokenizer/vocab.json',\n",
" '/content/workdir/tokenizer/merges.txt']"
]
},
"metadata": {
"tags": []
},
"execution_count": 30
}
]
},
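{
"cell_type": "markdown",
"metadata": {},
"source": [
"A hedged note on the cell above: in `tokenizers` 0.10.x, `ByteLevelBPETokenizer` wraps a Rust `Tokenizer`, and assigning to `tokenizer.post_processor` on the wrapper may only set a plain Python attribute rather than configuring the backend; attaching the processor through `tokenizer._tokenizer` is the commonly used workaround. The sketch below shows the conventional RoBERTa wiring, looking ids up from the trained vocab instead of hard-coding them. `fixed_tokenizer` is a hypothetical name introduced here, not part of the original gist, and the cell is not executed."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# Sketch (uses the imports from the cells above; not executed in the original run).\n",
"fixed_tokenizer = ByteLevelBPETokenizer(lowercase=False)\n",
"fixed_tokenizer.train(\n",
"    [\"./data.txt\"],\n",
"    vocab_size=1234,\n",
"    special_tokens=[\"<s>\", \"<pad>\", \"</s>\", \"<unk>\", \"<mask>\"],\n",
"    min_frequency=5,\n",
")\n",
"# Attach the processor to the underlying Tokenizer so it actually takes effect,\n",
"# pairing each special-token string with the id it received during training.\n",
"fixed_tokenizer._tokenizer.post_processor = RobertaProcessing(\n",
"    sep=(\"</s>\", fixed_tokenizer.token_to_id(\"</s>\")),\n",
"    cls=(\"<s>\", fixed_tokenizer.token_to_id(\"<s>\")),\n",
")"
],
"execution_count": null,
"outputs": []
},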
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "qhAbNtAaUny4",
"outputId": "e4e2c3da-61e1-4872-d6e7-e1a6f4a44670"
},
"source": [
"!ls ./workdir/tokenizer"
],
"execution_count": 31,
"outputs": [
{
"output_type": "stream",
"text": [
"merges.txt tokenizer.json vocab.json\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "iBGe4YdaUvi8"
},
"source": [
"from transformers import RobertaTokenizerFast, RobertaTokenizer"
],
"execution_count": 32,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "j0HrRzbrU0sn"
},
"source": [
"tfast = RobertaTokenizerFast.from_pretrained(\"./workdir/tokenizer\", model_max_length=10)"
],
"execution_count": 33,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "aMNukldtU323",
"outputId": "028ee0ee-a01c-48d7-cd0f-aa7c840db2d4"
},
"source": [
"tfast(\"asd\", add_special_tokens=True)"
],
"execution_count": 34,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'input_ids': [0, 400, 72, 2], 'attention_mask': [1, 1, 1, 1]}"
]
},
"metadata": {
"tags": []
},
"execution_count": 34
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "91Ed-y5xU8Xc",
"outputId": "f7dcb8f8-c166-4ff6-9b01-757b06e04e18"
},
"source": [
"tslow = RobertaTokenizer.from_pretrained(\"./workdir/tokenizer\", model_max_length=10)\n",
"tslow(\"asd\", add_special_tokens=True)"
],
"execution_count": 35,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'input_ids': [0, 400, 72, 2], 'attention_mask': [1, 1, 1, 1]}"
]
},
"metadata": {
"tags": []
},
"execution_count": 35
}
]
},
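{
"cell_type": "markdown",
"metadata": {},
"source": [
"One way to inspect what the post-processor actually inserted (a sketch, not executed in the original run): for the fast tokenizer, `BatchEncoding.tokens()` exposes the backend's token strings, while `convert_ids_to_tokens` maps ids through the vocab, so comparing the two can reveal a mismatched (string, id) pairing."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# Sketch: compare backend token strings with a plain vocab lookup of the ids.\n",
"enc = tfast(\"asd\", add_special_tokens=True)\n",
"print(enc.tokens())  # token strings as recorded by the fast backend\n",
"print(tfast.convert_ids_to_tokens(enc[\"input_ids\"]))  # ids mapped through the vocab"
],
"execution_count": null,
"outputs": []
},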
{
"cell_type": "code",
"metadata": {
"id": "_47Pb8CEVEnn"
},
"source": [
""
],
"execution_count": 10,
"outputs": []
}
]
}