@christianb93
Created April 8, 2023 12:44
Post #1 of the NLP series
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "425688a2-2705-464a-b770-6d2c9d3a4eab",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['My', 'name', 'is', 'John.', 'What', 'is', 'your', 'name?']\n"
]
}
],
"source": [
"text = \"My name is John. What is your name?\"\n",
"token = text.split()\n",
"print(token)"
]
},
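{
"cell_type": "code",
"execution_count": null,
"id": "added-cell-1",
"metadata": {},
"outputs": [],
"source": [
"# Quick sanity check (a small addition, assuming `token` from the cell above):\n",
"# the whitespace split keeps punctuation glued to the words, which is why\n",
"# 'John.' and 'name?' end up as tokens of their own\n",
"print([t for t in token if not t.isalpha()])"
]
},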
{
"cell_type": "code",
"execution_count": 2,
"id": "6d5a6ec9-76cc-4974-838a-b102db531038",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['my', 'name', 'is', 'john', '.', 'what', 'is', 'your', 'name', '?']\n"
]
}
],
"source": [
"import torchtext\n",
"\n",
"tokenizer = torchtext.data.utils.get_tokenizer(\"basic_english\")\n",
"print(tokenizer(text))"
]
},
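{
"cell_type": "code",
"execution_count": null,
"id": "added-cell-2",
"metadata": {},
"outputs": [],
"source": [
"# A rough do-it-yourself approximation of what basic_english does (lowercase,\n",
"# then separate words and punctuation) - only a sketch, not the exact rule\n",
"# set that torchtext applies\n",
"import re\n",
"\n",
"print(re.findall(r\"\\w+|[^\\w\\s]\", text.lower()))"
]
},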
{
"cell_type": "code",
"execution_count": 3,
"id": "57219c34-6f7d-42f2-9743-4b17f8c69f8e",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['my', 'name', 'is', 'john', '.', 'what', 'your', '?']\n"
]
}
],
"source": [
"import collections\n",
"token = tokenizer(text)\n",
"counter = collections.Counter(token)\n",
"vocabulary = [t for t in counter.keys()]\n",
"print(vocabulary)"
]
},
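{
"cell_type": "code",
"execution_count": null,
"id": "added-cell-3",
"metadata": {},
"outputs": [],
"source": [
"# Alternative sketch using torchtext's own vocabulary builder instead of a\n",
"# hand-rolled Counter; the <unk> special token and the default index are\n",
"# choices made here, not something the cell above requires\n",
"from torchtext.vocab import build_vocab_from_iterator\n",
"\n",
"vocab = build_vocab_from_iterator([token], specials=[\"<unk>\"])\n",
"vocab.set_default_index(vocab[\"<unk>\"])\n",
"print(vocab.get_itos())\n",
"print(vocab(token))"
]
},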
{
"cell_type": "code",
"execution_count": 4,
"id": "12b58f2c-d5d8-474c-86a0-610ff0e4c71b",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0, 1, 2, 3, 4, 5, 2, 6, 1, 7]\n"
]
}
],
"source": [
"stois = dict()\n",
"for idx, t in enumerate(vocabulary):\n",
" stois[t] = idx\n",
"encoded_text = [stois[t] for t in token]\n",
"print(encoded_text)"
]
},
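{
"cell_type": "code",
"execution_count": null,
"id": "added-cell-4",
"metadata": {},
"outputs": [],
"source": [
"# Sketch of how unseen words could be handled with the manual mapping: fall\n",
"# back to a dedicated <unk> index instead of raising a KeyError. The <unk>\n",
"# index and the test sentence are made up for illustration.\n",
"unk_idx = len(vocabulary)\n",
"new_tokens = tokenizer(\"My name is Alice\")\n",
"print([stois.get(t, unk_idx) for t in new_tokens])"
]
},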
{
"cell_type": "code",
"execution_count": 5,
"id": "99b6c8dd-df0b-4b9a-9914-0fc5f3b47837",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"my name is john . what is your name ?\n"
]
}
],
"source": [
"decoded_text = \" \".join([vocabulary[idx] for idx in encoded_text])\n",
"print(decoded_text)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "venv"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}