Comparing Several Tokenizers in the NLTK Library
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "9fb61d27",
   "metadata": {},
   "source": [
"Here I am comparing results from different tokenizers available on nltk library. There are about 15 tokenziers I have kept in compare_tokenizers- a user defined function. You can play along and try several other tokenizers available from other libraries ex. Spacy, Gensim, Keras, Textblob etc." | |
   ]
  },
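  {
   "cell_type": "markdown",
   "id": "b2c4e6a8",
   "metadata": {},
   "source": [
    "A quick warm-up before the full comparison: `word_tokenize` and `sent_tokenize` are NLTK's default entry points (Treebank-style word tokenization and Punkt sentence splitting; both are also imported in the next cell). This is a minimal sketch, assuming the `punkt` data has already been downloaded via `nltk.download('punkt')`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c3d5f7b9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustrative sketch: NLTK's default word and sentence tokenizers.\n",
    "# Assumes the 'punkt' model is available: nltk.download('punkt')\n",
    "from nltk.tokenize import word_tokenize, sent_tokenize\n",
    "\n",
    "sample = \"Hello #World.je suis dans un resto.\"\n",
    "print(word_tokenize(sample))  # Treebank-style word tokens\n",
    "print(sent_tokenize(sample))  # Punkt sentence boundaries"
   ]
  },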
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "a31bd6a3",
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
"WhitespaceTokenizer: ['Hello', '#World.je', 'suis', 'dans', 'un', 'resto.', '今までで最高のピザを食べました。😍...!!']\n", | |
"-----------------------------------------------------------------------------------------------------------------------------\n", | |
"WordPunctTokenizer: ['Hello', '#', 'World', '.', 'je', 'suis', 'dans', 'un', 'resto', '.', '今までで最高のピザを食べました', '。😍...!!']\n", | |
"-----------------------------------------------------------------------------------------------------------------------------\n", | |
"TreebankWordTokenizer: ['Hello', '#', 'World.je', 'suis', 'dans', 'un', 'resto.', '今までで最高のピザを食べました。😍', '...', '!', '!']\n", | |
"-----------------------------------------------------------------------------------------------------------------------------\n", | |
"PunktSentenceTokenizer: ['Hello #World.je suis dans un resto.', '今までで最高のピザを食べました。😍...!', '!']\n", | |
"-----------------------------------------------------------------------------------------------------------------------------\n", | |
"RegexpTokenizer: ['Hello', 'World', 'je', 'suis', 'dans', 'un', 'resto', '今までで最高のピザを食べました']\n", | |
"-----------------------------------------------------------------------------------------------------------------------------\n", | |
"SExprTokenizer: ['Hello #World.je suis dans un resto. 今までで最高のピザを食べました。😍...!!']\n", | |
"-----------------------------------------------------------------------------------------------------------------------------\n", | |
"LineTokenizer: ['Hello #World.je suis dans un resto. 今までで最高のピザを食べました。😍...!!']\n", | |
"-----------------------------------------------------------------------------------------------------------------------------\n", | |
"SpaceTokenizer: ['Hello', '#World.je', 'suis', 'dans', 'un', 'resto.', '今までで最高のピザを食べました。😍...!!']\n", | |
"-----------------------------------------------------------------------------------------------------------------------------\n", | |
"TweetTokenizer: ['Hello', '#World', '.', 'je', 'suis', 'dans', 'un', 'resto', '.', '今までで最高のピザを食べました', '。', '😍', '...', '!', '!']\n", | |
"-----------------------------------------------------------------------------------------------------------------------------\n", | |
"MosesTokenizer: ['Hello', '#', 'World.je', 'suis', 'dans', 'un', 'resto', '.', '今', 'までで', '最', '高', 'のピザを', '食', 'べました', '。', '😍', '...', '!', '!']\n", | |
"-----------------------------------------------------------------------------------------------------------------------------\n", | |
"ToktokTokenizer: ['Hello', '#World.je', 'suis', 'dans', 'un', 'resto.', '今までで最高のピザを食べました。😍', '...', '!', '!']\n", | |
"-----------------------------------------------------------------------------------------------------------------------------\n", | |
"MWETokenizer: ['H', 'e', 'l', 'l', 'o', ' ', '#', 'W', 'o', 'r', 'l', 'd', '.', 'j', 'e', ' ', 's', 'u', 'i', 's', ' ', 'd', 'a', 'n', 's', ' ', 'u', 'n', ' ', 'r', 'e', 's', 't', 'o', '.', ' ', '今', 'ま', 'で', 'で', '最', '高', 'の', 'ピ', 'ザ', 'を', '食', 'べ', 'ま', 'し', 'た', '。', '😍', '.', '.', '.', '!', '!']\n", | |
"-----------------------------------------------------------------------------------------------------------------------------\n", | |
"SyllableTokenizer: ['Hel', 'lo ', '#', 'World', '.', 'je ', 'sui', 's ', 'dan', 's u', 'n ', 'res', 'to', '.', ' 今', 'ま', 'で', 'で', '最', '高', 'の', 'ピ', 'ザ', 'を', '食', 'べ', 'ま', 'し', 'た', '。', '😍', '.', '', '.', '', '.', '', '!', '!']\n", | |
"-----------------------------------------------------------------------------------------------------------------------------\n", | |
"TabTokenizer: ['Hello #World.je suis dans un resto. 今までで最高のピザを食べました。😍...!!']\n", | |
"-----------------------------------------------------------------------------------------------------------------------------\n", | |
"BlanklineTokenizer: ['Hello #World.je suis dans un resto. 今までで最高のピザを食べました。😍...!!']\n", | |
"-----------------------------------------------------------------------------------------------------------------------------\n" | |
] | |
} | |
], | |
"source": [ | |
"## Get multiple outputs in the same cell\n", | |
"from IPython.core.interactiveshell import InteractiveShell\n", | |
"InteractiveShell.ast_node_interactivity = \"all\"\n", | |
"\n", | |
"## Ignore all warnings\n", | |
"import warnings\n", | |
"warnings.filterwarnings(\"ignore\")\n", | |
"\n", | |
"import nltk\n", | |
"from nltk.tokenize import (WhitespaceTokenizer,WordPunctTokenizer,TreebankWordTokenizer,\n", | |
" PunktSentenceTokenizer,RegexpTokenizer,SExprTokenizer,LineTokenizer,\n", | |
" SpaceTokenizer,TweetTokenizer,ToktokTokenizer,MWETokenizer,SyllableTokenizer,\n", | |
" TabTokenizer,word_tokenize,sent_tokenize,BlanklineTokenizer)\n", | |
"\n", | |
"#!pip install -U sacremoses\n", | |
"from sacremoses import MosesTokenizer #This one is moved out of the nltk library and can be found in sacremoses.\n", | |
"\n", | |
"text = \"Hello #World.je suis dans un resto. 今までで最高のピザを食べました。😍...!!\"\n", | |
"\n", | |
"def compare_tokenizers(text):\n", | |
" tokenizers = [\n", | |
" WhitespaceTokenizer(),\n", | |
" WordPunctTokenizer(),\n", | |
" TreebankWordTokenizer(),\n", | |
" PunktSentenceTokenizer(),\n", | |
" RegexpTokenizer('\\w+'),\n", | |
" SExprTokenizer(),\n", | |
" LineTokenizer(),\n", | |
" SpaceTokenizer(),\n", | |
" TweetTokenizer(),\n", | |
" MosesTokenizer(),\n", | |
" ToktokTokenizer(),\n", | |
" MWETokenizer(),\n", | |
" SyllableTokenizer(),\n", | |
" TabTokenizer(),\n", | |
" BlanklineTokenizer()\n", | |
" ]\n", | |
"\n", | |
" for tokenizer in tokenizers:\n", | |
" print(f\"{tokenizer.__class__.__name__}: {tokenizer.tokenize(text)}\")\n", | |
" print(\"-\"*125)\n", | |
" \n", | |
"compare_tokenizers(text) " | |
   ]
  },
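  {
   "cell_type": "markdown",
   "id": "d4e6a8c0",
   "metadata": {},
   "source": [
    "As a sketch of trying a tokenizer from another library (as suggested above), the cell below runs the same `text` variable through spaCy's rule-based tokenizer. It assumes spaCy is installed (`pip install spacy`); `spacy.blank` builds a bare pipeline, so no pretrained model download is needed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e5f7b9d1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustrative sketch, not part of the original comparison:\n",
    "# tokenize the same text with spaCy's rule-based tokenizer.\n",
    "#!pip install spacy\n",
    "import spacy\n",
    "\n",
    "nlp = spacy.blank(\"en\")  # blank English pipeline: tokenizer only, no trained components\n",
    "print(f\"spacy.blank('en'): {[token.text for token in nlp(text)]}\")"
   ]
  },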
  {
   "cell_type": "markdown",
   "id": "ef70983a",
   "metadata": {},
   "source": [
"Depending on your use case you can pick and choose whichever tokenizer best suitable in terms of tokens it generates." | |
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
} |