Skip to content

Instantly share code, notes, and snippets.

@BadreeshShetty
Created November 6, 2021 11:06

Revisions

  1. BadreeshShetty created this gist Nov 6, 2021.
    307 changes: 307 additions & 0 deletions 6 Lemmatizing.ipynb
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,307 @@
    {
    "cells": [
    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "## Lemmatizing"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 13,
    "metadata": {
    "scrolled": false
    },
    "outputs": [
    {
    "data": {
    "text/html": [
    "<div>\n",
    "<style scoped>\n",
    " .dataframe tbody tr th:only-of-type {\n",
    " vertical-align: middle;\n",
    " }\n",
    "\n",
    " .dataframe tbody tr th {\n",
    " vertical-align: top;\n",
    " }\n",
    "\n",
    " .dataframe thead th {\n",
    " text-align: right;\n",
    " }\n",
    "</style>\n",
    "<table border=\"1\" class=\"dataframe\">\n",
    " <thead>\n",
    " <tr style=\"text-align: right;\">\n",
    " <th></th>\n",
    " <th>label</th>\n",
    " <th>body_text</th>\n",
    " <th>body_text_clean</th>\n",
    " <th>body_text_tokenized</th>\n",
    " <th>body_text_nostop</th>\n",
    " <th>body_text_stemmed</th>\n",
    " <th>body_text_lemmatized</th>\n",
    " </tr>\n",
    " </thead>\n",
    " <tbody>\n",
    " <tr>\n",
    " <th>0</th>\n",
    " <td>ham</td>\n",
    " <td>I've been searching for the right words to tha...</td>\n",
    " <td>Ive been searching for the right words to than...</td>\n",
    " <td>[ive, been, searching, for, the, right, words,...</td>\n",
    " <td>[ive, searching, right, words, thank, breather...</td>\n",
    " <td>[ive, search, right, word, thank, breather, pr...</td>\n",
    " <td>[ive, searching, right, word, thank, breather,...</td>\n",
    " </tr>\n",
    " <tr>\n",
    " <th>1</th>\n",
    " <td>spam</td>\n",
    " <td>Free entry in 2 a wkly comp to win FA Cup fina...</td>\n",
    " <td>Free entry in 2 a wkly comp to win FA Cup fina...</td>\n",
    " <td>[free, entry, in, 2, a, wkly, comp, to, win, f...</td>\n",
    " <td>[free, entry, 2, wkly, comp, win, fa, cup, fin...</td>\n",
    " <td>[free, entri, 2, wkli, comp, win, fa, cup, fin...</td>\n",
    " <td>[free, entry, 2, wkly, comp, win, fa, cup, fin...</td>\n",
    " </tr>\n",
    " <tr>\n",
    " <th>2</th>\n",
    " <td>ham</td>\n",
    " <td>Nah I don't think he goes to usf, he lives aro...</td>\n",
    " <td>Nah I dont think he goes to usf he lives aroun...</td>\n",
    " <td>[nah, i, dont, think, he, goes, to, usf, he, l...</td>\n",
    " <td>[nah, dont, think, goes, usf, lives, around, t...</td>\n",
    " <td>[nah, dont, think, goe, usf, live, around, tho...</td>\n",
    " <td>[nah, dont, think, go, usf, life, around, though]</td>\n",
    " </tr>\n",
    " <tr>\n",
    " <th>3</th>\n",
    " <td>ham</td>\n",
    " <td>Even my brother is not like to speak with me. ...</td>\n",
    " <td>Even my brother is not like to speak with me T...</td>\n",
    " <td>[even, my, brother, is, not, like, to, speak, ...</td>\n",
    " <td>[even, brother, like, speak, treat, like, aids...</td>\n",
    " <td>[even, brother, like, speak, treat, like, aid,...</td>\n",
    " <td>[even, brother, like, speak, treat, like, aid,...</td>\n",
    " </tr>\n",
    " <tr>\n",
    " <th>4</th>\n",
    " <td>ham</td>\n",
    " <td>I HAVE A DATE ON SUNDAY WITH WILL!!</td>\n",
    " <td>I HAVE A DATE ON SUNDAY WITH WILL</td>\n",
    " <td>[i, have, a, date, on, sunday, with, will]</td>\n",
    " <td>[date, sunday]</td>\n",
    " <td>[date, sunday]</td>\n",
    " <td>[date, sunday]</td>\n",
    " </tr>\n",
    " <tr>\n",
    " <th>5</th>\n",
    " <td>ham</td>\n",
    " <td>As per your request 'Melle Melle (Oru Minnamin...</td>\n",
    " <td>As per your request Melle Melle Oru Minnaminun...</td>\n",
    " <td>[as, per, your, request, melle, melle, oru, mi...</td>\n",
    " <td>[per, request, melle, melle, oru, minnaminungi...</td>\n",
    " <td>[per, request, mell, mell, oru, minnaminungint...</td>\n",
    " <td>[per, request, melle, melle, oru, minnaminungi...</td>\n",
    " </tr>\n",
    " <tr>\n",
    " <th>6</th>\n",
    " <td>spam</td>\n",
    " <td>WINNER!! As a valued network customer you have...</td>\n",
    " <td>WINNER As a valued network customer you have b...</td>\n",
    " <td>[winner, as, a, valued, network, customer, you...</td>\n",
    " <td>[winner, valued, network, customer, selected, ...</td>\n",
    " <td>[winner, valu, network, custom, select, receiv...</td>\n",
    " <td>[winner, valued, network, customer, selected, ...</td>\n",
    " </tr>\n",
    " <tr>\n",
    " <th>7</th>\n",
    " <td>spam</td>\n",
    " <td>Had your mobile 11 months or more? U R entitle...</td>\n",
    " <td>Had your mobile 11 months or more U R entitled...</td>\n",
    " <td>[had, your, mobile, 11, months, or, more, u, r...</td>\n",
    " <td>[mobile, 11, months, u, r, entitled, update, l...</td>\n",
    " <td>[mobil, 11, month, u, r, entitl, updat, latest...</td>\n",
    " <td>[mobile, 11, month, u, r, entitled, update, la...</td>\n",
    " </tr>\n",
    " <tr>\n",
    " <th>8</th>\n",
    " <td>ham</td>\n",
    " <td>I'm gonna be home soon and i don't want to tal...</td>\n",
    " <td>Im gonna be home soon and i dont want to talk ...</td>\n",
    " <td>[im, gonna, be, home, soon, and, i, dont, want...</td>\n",
    " <td>[im, gonna, home, soon, dont, want, talk, stuf...</td>\n",
    " <td>[im, gonna, home, soon, dont, want, talk, stuf...</td>\n",
    " <td>[im, gonna, home, soon, dont, want, talk, stuf...</td>\n",
    " </tr>\n",
    " <tr>\n",
    " <th>9</th>\n",
    " <td>spam</td>\n",
    " <td>SIX chances to win CASH! From 100 to 20,000 po...</td>\n",
    " <td>SIX chances to win CASH From 100 to 20000 poun...</td>\n",
    " <td>[six, chances, to, win, cash, from, 100, to, 2...</td>\n",
    " <td>[six, chances, win, cash, 100, 20000, pounds, ...</td>\n",
    " <td>[six, chanc, win, cash, 100, 20000, pound, txt...</td>\n",
    " <td>[six, chance, win, cash, 100, 20000, pound, tx...</td>\n",
    " </tr>\n",
    " </tbody>\n",
    "</table>\n",
    "</div>"
    ],
    "text/plain": [
    " label body_text \\\n",
    "0 ham I've been searching for the right words to tha... \n",
    "1 spam Free entry in 2 a wkly comp to win FA Cup fina... \n",
    "2 ham Nah I don't think he goes to usf, he lives aro... \n",
    "3 ham Even my brother is not like to speak with me. ... \n",
    "4 ham I HAVE A DATE ON SUNDAY WITH WILL!! \n",
    "5 ham As per your request 'Melle Melle (Oru Minnamin... \n",
    "6 spam WINNER!! As a valued network customer you have... \n",
    "7 spam Had your mobile 11 months or more? U R entitle... \n",
    "8 ham I'm gonna be home soon and i don't want to tal... \n",
    "9 spam SIX chances to win CASH! From 100 to 20,000 po... \n",
    "\n",
    " body_text_clean \\\n",
    "0 Ive been searching for the right words to than... \n",
    "1 Free entry in 2 a wkly comp to win FA Cup fina... \n",
    "2 Nah I dont think he goes to usf he lives aroun... \n",
    "3 Even my brother is not like to speak with me T... \n",
    "4 I HAVE A DATE ON SUNDAY WITH WILL \n",
    "5 As per your request Melle Melle Oru Minnaminun... \n",
    "6 WINNER As a valued network customer you have b... \n",
    "7 Had your mobile 11 months or more U R entitled... \n",
    "8 Im gonna be home soon and i dont want to talk ... \n",
    "9 SIX chances to win CASH From 100 to 20000 poun... \n",
    "\n",
    " body_text_tokenized \\\n",
    "0 [ive, been, searching, for, the, right, words,... \n",
    "1 [free, entry, in, 2, a, wkly, comp, to, win, f... \n",
    "2 [nah, i, dont, think, he, goes, to, usf, he, l... \n",
    "3 [even, my, brother, is, not, like, to, speak, ... \n",
    "4 [i, have, a, date, on, sunday, with, will] \n",
    "5 [as, per, your, request, melle, melle, oru, mi... \n",
    "6 [winner, as, a, valued, network, customer, you... \n",
    "7 [had, your, mobile, 11, months, or, more, u, r... \n",
    "8 [im, gonna, be, home, soon, and, i, dont, want... \n",
    "9 [six, chances, to, win, cash, from, 100, to, 2... \n",
    "\n",
    " body_text_nostop \\\n",
    "0 [ive, searching, right, words, thank, breather... \n",
    "1 [free, entry, 2, wkly, comp, win, fa, cup, fin... \n",
    "2 [nah, dont, think, goes, usf, lives, around, t... \n",
    "3 [even, brother, like, speak, treat, like, aids... \n",
    "4 [date, sunday] \n",
    "5 [per, request, melle, melle, oru, minnaminungi... \n",
    "6 [winner, valued, network, customer, selected, ... \n",
    "7 [mobile, 11, months, u, r, entitled, update, l... \n",
    "8 [im, gonna, home, soon, dont, want, talk, stuf... \n",
    "9 [six, chances, win, cash, 100, 20000, pounds, ... \n",
    "\n",
    " body_text_stemmed \\\n",
    "0 [ive, search, right, word, thank, breather, pr... \n",
    "1 [free, entri, 2, wkli, comp, win, fa, cup, fin... \n",
    "2 [nah, dont, think, goe, usf, live, around, tho... \n",
    "3 [even, brother, like, speak, treat, like, aid,... \n",
    "4 [date, sunday] \n",
    "5 [per, request, mell, mell, oru, minnaminungint... \n",
    "6 [winner, valu, network, custom, select, receiv... \n",
    "7 [mobil, 11, month, u, r, entitl, updat, latest... \n",
    "8 [im, gonna, home, soon, dont, want, talk, stuf... \n",
    "9 [six, chanc, win, cash, 100, 20000, pound, txt... \n",
    "\n",
    " body_text_lemmatized \n",
    "0 [ive, searching, right, word, thank, breather,... \n",
    "1 [free, entry, 2, wkly, comp, win, fa, cup, fin... \n",
    "2 [nah, dont, think, go, usf, life, around, though] \n",
    "3 [even, brother, like, speak, treat, like, aid,... \n",
    "4 [date, sunday] \n",
    "5 [per, request, melle, melle, oru, minnaminungi... \n",
    "6 [winner, valued, network, customer, selected, ... \n",
    "7 [mobile, 11, month, u, r, entitled, update, la... \n",
    "8 [im, gonna, home, soon, dont, want, talk, stuf... \n",
    "9 [six, chance, win, cash, 100, 20000, pound, tx... "
    ]
    },
    "execution_count": 13,
    "metadata": {},
    "output_type": "execute_result"
    }
    ],
    "source": [
    "wn = nltk.WordNetLemmatizer()\n",
    "\n",
    "def lemmatizing(tokenized_text):\n",
    "    \"\"\"Lemmatize every token with the shared WordNetLemmatizer `wn`; return a new list.\"\"\"\n",
    "    return [wn.lemmatize(token) for token in tokenized_text]\n",
    "\n",
    "data['body_text_lemmatized'] = data['body_text_nostop'].apply(lemmatizing)\n",
    "\n",
    "data.head(10)"
    ]
    }
    ],
    "metadata": {
    "kernelspec": {
    "display_name": "Python 3",
    "language": "python",
    "name": "python3"
    },
    "language_info": {
    "codemirror_mode": {
    "name": "ipython",
    "version": 3
    },
    "file_extension": ".py",
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
    "version": "3.7.2"
    },
    "toc": {
    "base_numbering": 1,
    "nav_menu": {},
    "number_sections": false,
    "sideBar": true,
    "skip_h1_title": false,
    "title_cell": "Table of Contents",
    "title_sidebar": "Contents",
    "toc_cell": false,
    "toc_position": {},
    "toc_section_display": true,
    "toc_window_display": false
    },
    "varInspector": {
    "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
    },
    "kernels_config": {
    "python": {
    "delete_cmd_postfix": "",
    "delete_cmd_prefix": "del ",
    "library": "var_list.py",
    "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
    "delete_cmd_postfix": ") ",
    "delete_cmd_prefix": "rm(",
    "library": "var_list.r",
    "varRefreshCmd": "cat(var_dic_list()) "
    }
    },
    "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
    ],
    "window_display": false
    }
    },
    "nbformat": 4,
    "nbformat_minor": 2
    }