Created
November 6, 2021 11:06
Revisions
-
BadreeshShetty created this gist
Nov 6, 2021 .There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,307 @@ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Lemmatizing" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>label</th>\n", " <th>body_text</th>\n", " <th>body_text_clean</th>\n", " <th>body_text_tokenized</th>\n", " <th>body_text_nostop</th>\n", " <th>body_text_stemmed</th>\n", " <th>body_text_lemmatized</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>ham</td>\n", " <td>I've been searching for the right words to tha...</td>\n", " <td>Ive been searching for the right words to than...</td>\n", " <td>[ive, been, searching, for, the, right, words,...</td>\n", " <td>[ive, searching, right, words, thank, breather...</td>\n", " <td>[ive, search, right, word, thank, breather, pr...</td>\n", " <td>[ive, searching, right, word, thank, breather,...</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>spam</td>\n", " <td>Free entry in 2 a wkly comp to win FA Cup fina...</td>\n", " <td>Free entry in 2 a wkly comp to win FA Cup fina...</td>\n", " <td>[free, entry, in, 2, a, wkly, comp, to, win, f...</td>\n", " <td>[free, entry, 2, wkly, comp, win, fa, cup, fin...</td>\n", " <td>[free, entri, 2, wkli, comp, win, fa, cup, fin...</td>\n", " <td>[free, entry, 2, wkly, comp, win, fa, cup, fin...</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>ham</td>\n", " <td>Nah I don't think he goes to usf, he lives aro...</td>\n", " <td>Nah I dont think he goes to usf he lives aroun...</td>\n", " <td>[nah, i, dont, think, he, goes, to, usf, he, l...</td>\n", " <td>[nah, dont, think, goes, usf, lives, around, t...</td>\n", " <td>[nah, dont, think, goe, usf, live, around, tho...</td>\n", " <td>[nah, dont, think, go, usf, life, around, though]</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>ham</td>\n", " <td>Even my brother is not like to speak with me. ...</td>\n", " <td>Even my brother is not like to speak with me T...</td>\n", " <td>[even, my, brother, is, not, like, to, speak, ...</td>\n", " <td>[even, brother, like, speak, treat, like, aids...</td>\n", " <td>[even, brother, like, speak, treat, like, aid,...</td>\n", " <td>[even, brother, like, speak, treat, like, aid,...</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>ham</td>\n", " <td>I HAVE A DATE ON SUNDAY WITH WILL!!</td>\n", " <td>I HAVE A DATE ON SUNDAY WITH WILL</td>\n", " <td>[i, have, a, date, on, sunday, with, will]</td>\n", " <td>[date, sunday]</td>\n", " <td>[date, sunday]</td>\n", " <td>[date, sunday]</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>ham</td>\n", " <td>As per your request 'Melle Melle (Oru Minnamin...</td>\n", " <td>As per your request Melle Melle Oru Minnaminun...</td>\n", " <td>[as, per, your, request, melle, melle, oru, mi...</td>\n", " <td>[per, request, melle, melle, oru, minnaminungi...</td>\n", " <td>[per, request, mell, mell, oru, minnaminungint...</td>\n", " <td>[per, request, melle, melle, oru, minnaminungi...</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>spam</td>\n", " <td>WINNER!! As a valued network customer you have...</td>\n", " <td>WINNER As a valued network customer you have b...</td>\n", " <td>[winner, as, a, valued, network, customer, you...</td>\n", " <td>[winner, valued, network, customer, selected, ...</td>\n", " <td>[winner, valu, network, custom, select, receiv...</td>\n", " <td>[winner, valued, network, customer, selected, ...</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>spam</td>\n", " <td>Had your mobile 11 months or more? U R entitle...</td>\n", " <td>Had your mobile 11 months or more U R entitled...</td>\n", " <td>[had, your, mobile, 11, months, or, more, u, r...</td>\n", " <td>[mobile, 11, months, u, r, entitled, update, l...</td>\n", " <td>[mobil, 11, month, u, r, entitl, updat, latest...</td>\n", " <td>[mobile, 11, month, u, r, entitled, update, la...</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>ham</td>\n", " <td>I'm gonna be home soon and i don't want to tal...</td>\n", " <td>Im gonna be home soon and i dont want to talk ...</td>\n", " <td>[im, gonna, be, home, soon, and, i, dont, want...</td>\n", " <td>[im, gonna, home, soon, dont, want, talk, stuf...</td>\n", " <td>[im, gonna, home, soon, dont, want, talk, stuf...</td>\n", " <td>[im, gonna, home, soon, dont, want, talk, stuf...</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>spam</td>\n", " <td>SIX chances to win CASH! From 100 to 20,000 po...</td>\n", " <td>SIX chances to win CASH From 100 to 20000 poun...</td>\n", " <td>[six, chances, to, win, cash, from, 100, to, 2...</td>\n", " <td>[six, chances, win, cash, 100, 20000, pounds, ...</td>\n", " <td>[six, chanc, win, cash, 100, 20000, pound, txt...</td>\n", " <td>[six, chance, win, cash, 100, 20000, pound, tx...</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " label body_text \\\n", "0 ham I've been searching for the right words to tha... \n", "1 spam Free entry in 2 a wkly comp to win FA Cup fina... \n", "2 ham Nah I don't think he goes to usf, he lives aro... \n", "3 ham Even my brother is not like to speak with me. ... \n", "4 ham I HAVE A DATE ON SUNDAY WITH WILL!! \n", "5 ham As per your request 'Melle Melle (Oru Minnamin... \n", "6 spam WINNER!! As a valued network customer you have... \n", "7 spam Had your mobile 11 months or more? U R entitle... \n", "8 ham I'm gonna be home soon and i don't want to tal... \n", "9 spam SIX chances to win CASH! From 100 to 20,000 po... \n", "\n", " body_text_clean \\\n", "0 Ive been searching for the right words to than... \n", "1 Free entry in 2 a wkly comp to win FA Cup fina... \n", "2 Nah I dont think he goes to usf he lives aroun... \n", "3 Even my brother is not like to speak with me T... \n", "4 I HAVE A DATE ON SUNDAY WITH WILL \n", "5 As per your request Melle Melle Oru Minnaminun... \n", "6 WINNER As a valued network customer you have b... \n", "7 Had your mobile 11 months or more U R entitled... \n", "8 Im gonna be home soon and i dont want to talk ... \n", "9 SIX chances to win CASH From 100 to 20000 poun... \n", "\n", " body_text_tokenized \\\n", "0 [ive, been, searching, for, the, right, words,... \n", "1 [free, entry, in, 2, a, wkly, comp, to, win, f... \n", "2 [nah, i, dont, think, he, goes, to, usf, he, l... \n", "3 [even, my, brother, is, not, like, to, speak, ... \n", "4 [i, have, a, date, on, sunday, with, will] \n", "5 [as, per, your, request, melle, melle, oru, mi... \n", "6 [winner, as, a, valued, network, customer, you... \n", "7 [had, your, mobile, 11, months, or, more, u, r... \n", "8 [im, gonna, be, home, soon, and, i, dont, want... \n", "9 [six, chances, to, win, cash, from, 100, to, 2... \n", "\n", " body_text_nostop \\\n", "0 [ive, searching, right, words, thank, breather... \n", "1 [free, entry, 2, wkly, comp, win, fa, cup, fin... \n", "2 [nah, dont, think, goes, usf, lives, around, t... \n", "3 [even, brother, like, speak, treat, like, aids... \n", "4 [date, sunday] \n", "5 [per, request, melle, melle, oru, minnaminungi... \n", "6 [winner, valued, network, customer, selected, ... \n", "7 [mobile, 11, months, u, r, entitled, update, l... \n", "8 [im, gonna, home, soon, dont, want, talk, stuf... \n", "9 [six, chances, win, cash, 100, 20000, pounds, ... \n", "\n", " body_text_stemmed \\\n", "0 [ive, search, right, word, thank, breather, pr... \n", "1 [free, entri, 2, wkli, comp, win, fa, cup, fin... \n", "2 [nah, dont, think, goe, usf, live, around, tho... \n", "3 [even, brother, like, speak, treat, like, aid,... \n", "4 [date, sunday] \n", "5 [per, request, mell, mell, oru, minnaminungint... \n", "6 [winner, valu, network, custom, select, receiv... \n", "7 [mobil, 11, month, u, r, entitl, updat, latest... \n", "8 [im, gonna, home, soon, dont, want, talk, stuf... \n", "9 [six, chanc, win, cash, 100, 20000, pound, txt... \n", "\n", " body_text_lemmatized \n", "0 [ive, searching, right, word, thank, breather,... \n", "1 [free, entry, 2, wkly, comp, win, fa, cup, fin... \n", "2 [nah, dont, think, go, usf, life, around, though] \n", "3 [even, brother, like, speak, treat, like, aid,... \n", "4 [date, sunday] \n", "5 [per, request, melle, melle, oru, minnaminungi... \n", "6 [winner, valued, network, customer, selected, ... \n", "7 [mobile, 11, month, u, r, entitled, update, la... \n", "8 [im, gonna, home, soon, dont, want, talk, stuf... \n", "9 [six, chance, win, cash, 100, 20000, pound, tx... " ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wn = nltk.WordNetLemmatizer()\n", "\n", "def lemmatizing(tokenized_text):\n", " text = [wn.lemmatize(word) for word in tokenized_text]\n", " return text\n", "\n", "data['body_text_lemmatized'] = data['body_text_nostop'].apply(lambda x: lemmatizing(x))\n", "\n", "data.head(10)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.2" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": false, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": false }, "varInspector": { "cols": { "lenName": 16, "lenType": 16, "lenVar": 40 }, "kernels_config": { "python": { "delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", "varRefreshCmd": "print(var_dic_list())" }, "r": { "delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": "var_list.r", "varRefreshCmd": "cat(var_dic_list()) " } }, "types_to_exclude": [ "module", "function", "builtin_function_or_method", "instance", "_Feature" ], "window_display": false } }, "nbformat": 4, "nbformat_minor": 2 }