Created
November 6, 2021 11:07
-
-
Save BadreeshShetty/fdd7706e4c7b1553b00b2d7cb00120de to your computer and use it in GitHub Desktop.
NLP ML (Built-In)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Create features for message length and % of punctuation in the text" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>label</th>\n", | |
" <th>body_text</th>\n", | |
" <th>body_len</th>\n", | |
" <th>punct%</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>spam</td>\n", | |
" <td>Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...</td>\n", | |
" <td>128</td>\n", | |
" <td>4.7</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>ham</td>\n", | |
" <td>Nah I don't think he goes to usf, he lives around here though</td>\n", | |
" <td>49</td>\n", | |
" <td>4.1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>ham</td>\n", | |
" <td>Even my brother is not like to speak with me. They treat me like aids patent.</td>\n", | |
" <td>62</td>\n", | |
" <td>3.2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>ham</td>\n", | |
" <td>I HAVE A DATE ON SUNDAY WITH WILL!!</td>\n", | |
" <td>28</td>\n", | |
" <td>7.1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>ham</td>\n", | |
" <td>As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...</td>\n", | |
" <td>135</td>\n", | |
" <td>4.4</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" label \\\n", | |
"0 spam \n", | |
"1 ham \n", | |
"2 ham \n", | |
"3 ham \n", | |
"4 ham \n", | |
"\n", | |
" body_text \\\n", | |
"0 Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ... \n", | |
"1 Nah I don't think he goes to usf, he lives around here though \n", | |
"2 Even my brother is not like to speak with me. They treat me like aids patent. \n", | |
"3 I HAVE A DATE ON SUNDAY WITH WILL!! \n", | |
"4 As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call... \n", | |
"\n", | |
" body_len punct% \n", | |
"0 128 4.7 \n", | |
"1 49 4.1 \n", | |
"2 62 3.2 \n", | |
"3 28 7.1 \n", | |
"4 135 4.4 " | |
] | |
}, | |
"execution_count": 27, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"import string\n", | |
"\n", | |
"# Message-length feature: number of characters in each message, not counting spaces.\n", | |
"# (No intermediate data.head() here: only the last expression of a cell is displayed,\n", | |
"# so a preview in the middle of the cell is computed and then thrown away.)\n", | |
"data['body_len'] = data['body_text'].apply(lambda x: len(x) - x.count(\" \"))\n", | |
"\n", | |
"def count_punct(text):\n", | |
"    \"\"\"Return the percentage of non-space characters in `text` that are punctuation.\n", | |
"\n", | |
"    A character counts as punctuation when it is in `string.punctuation`.\n", | |
"    The result is rounded to one decimal place. Text that has no non-space\n", | |
"    characters (empty or all spaces) yields 0.0 instead of dividing by zero.\n", | |
"    \"\"\"\n", | |
"    non_space_len = len(text) - text.count(\" \")\n", | |
"    if non_space_len == 0:\n", | |
"        return 0.0\n", | |
"    punct_count = sum(char in string.punctuation for char in text)\n", | |
"    # Scale to a percentage first, then round: rounding the fraction and multiplying\n", | |
"    # by 100 afterwards can reintroduce float noise (e.g. 0.07 * 100 == 7.000000000000001).\n", | |
"    return round(100 * punct_count / non_space_len, 1)\n", | |
"\n", | |
"# Punctuation share of each message; count_punct takes one argument, so it is\n", | |
"# passed to apply directly.\n", | |
"data['punct%'] = data['body_text'].apply(count_punct)\n", | |
"\n", | |
"# Last expression of the cell: preview the first rows, including the new columns.\n", | |
"data.head()" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.2" | |
}, | |
"toc": { | |
"base_numbering": 1, | |
"nav_menu": {}, | |
"number_sections": false, | |
"sideBar": true, | |
"skip_h1_title": false, | |
"title_cell": "Table of Contents", | |
"title_sidebar": "Contents", | |
"toc_cell": false, | |
"toc_position": {}, | |
"toc_section_display": true, | |
"toc_window_display": false | |
}, | |
"varInspector": { | |
"cols": { | |
"lenName": 16, | |
"lenType": 16, | |
"lenVar": 40 | |
}, | |
"kernels_config": { | |
"python": { | |
"delete_cmd_postfix": "", | |
"delete_cmd_prefix": "del ", | |
"library": "var_list.py", | |
"varRefreshCmd": "print(var_dic_list())" | |
}, | |
"r": { | |
"delete_cmd_postfix": ") ", | |
"delete_cmd_prefix": "rm(", | |
"library": "var_list.r", | |
"varRefreshCmd": "cat(var_dic_list()) " | |
} | |
}, | |
"types_to_exclude": [ | |
"module", | |
"function", | |
"builtin_function_or_method", | |
"instance", | |
"_Feature" | |
], | |
"window_display": false | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment