Skip to content

Instantly share code, notes, and snippets.

@indiranell
Created August 28, 2023 10:31
Show Gist options
  • Save indiranell/dbd94c4bfa6fa50b83199914c6996660 to your computer and use it in GitHub Desktop.
Save indiranell/dbd94c4bfa6fa50b83199914c6996660 to your computer and use it in GitHub Desktop.
Pto classifier
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 103,
"id": "cdf70556",
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"import pandas as pd\n",
"from nltk.corpus import stopwords\n",
"from nltk.stem import SnowballStemmer\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.feature_selection import SelectKBest, chi2\n",
"from deepchecks.nlp import TextData\n",
"from deepchecks.nlp.suites import full_suite\n",
"from deepchecks.nlp.suites import data_integrity\n",
"from deepchecks.nlp.suites import train_test_validation\n",
"from deepchecks.nlp.suites import model_evaluation\n",
"from sklearn.ensemble import RandomForestClassifier"
]
},
{
"cell_type": "code",
"execution_count": 121,
"id": "7185dc88",
"metadata": {},
"outputs": [],
"source": [
"data = pd.read_csv('pto_messages.csv', names=['sentence', 'label'])\n",
"data = data.sample(frac=1).reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 122,
"id": "3823aa3f",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sentence</th>\n",
" <th>label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>I m still not feeling wellfever is gone but se...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Will be taking the day off as Im not feeling well</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>so yesterday itself you thought about it</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>I have some emergency work I have to go now</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>taking</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" sentence label\n",
"0 I m still not feeling wellfever is gone but se... 1\n",
"1 Will be taking the day off as Im not feeling well 1\n",
"2 so yesterday itself you thought about it 0\n",
"3 I have some emergency work I have to go now 0\n",
"4 taking 0"
]
},
"execution_count": 122,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 123,
"id": "d2607269",
"metadata": {},
"outputs": [],
"source": [
"# Pre-processing: customise the NLTK English stop-word list, then split the data into train and test sets\n",
"stemmer = SnowballStemmer('english')\n",
"words = stopwords.words(\"english\")\n",
"NOT_STOP_WORDS = ['not','off','be','will','before','after','out']\n",
"ADD_STOP_WORDS = ['today', 'tomorrow', 'yesterday']\n",
"for word in NOT_STOP_WORDS:\n",
" words.remove(word)\n",
"for word in ADD_STOP_WORDS:\n",
" words.append(word)\n",
"#data['cleaned'] = data['sentence'].apply(lambda x: \" \".join([stemmer.stem(i) for i in re.sub(\"[^a-zA-Z]\", \" \", x).split() if i not in words]).lower())\n",
"X_train, X_test, y_train, y_test = train_test_split(data['sentence'], data.label, test_size=0.1)"
]
},
{
"cell_type": "code",
"execution_count": 124,
"id": "17455c58",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.8974358974358975\n"
]
}
],
"source": [
"pipeline = Pipeline([('vect', TfidfVectorizer(ngram_range=(1, 4), stop_words=words, sublinear_tf=True)),\n",
" ('chi', SelectKBest(chi2, k=1000)), \n",
" ('rfc',RandomForestClassifier(n_estimators=100))])\n",
"\n",
"model = pipeline.fit(X_train, y_train)\n",
"accuracy_score = model.score(X_test, y_test)\n",
"\n",
"print (accuracy_score)"
]
},
{
"cell_type": "code",
"execution_count": 135,
"id": "20fa2e9d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" <style>\n",
" progress {\n",
" -webkit-appearance: none;\n",
" border: none;\n",
" border-radius: 3px;\n",
" width: 300px;\n",
" height: 20px;\n",
" vertical-align: middle;\n",
" margin-right: 10px;\n",
" background-color: aliceblue;\n",
" }\n",
" progress::-webkit-progress-bar {\n",
" border-radius: 3px;\n",
" background-color: aliceblue;\n",
" }\n",
" progress::-webkit-progress-value {\n",
" background-color: #9d60fb;\n",
" }\n",
" progress::-moz-progress-bar {\n",
" background-color: #9d60fb;\n",
" }\n",
" </style>\n",
" "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": []
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "cff84a605b43477eba59256ffafe097f",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Accordion(children=(VBox(children=(HTML(value='\\n<h1 id=\"summary_GBN0PKFT2TNG60JQ4BMWMFXGM\">Data Integrity Sui…"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"#Data integrity\n",
"train = TextData(X_train, label=y_train, task_type='text_classification')\n",
"test = TextData(X_test, label=y_test, task_type='text_classification')\n",
"data_integrity_suite = data_integrity()\n",
"\n",
"data_integrity_suite.run(train, test)\n"
]
},
{
"cell_type": "code",
"execution_count": 136,
"id": "2344dcb6",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" <style>\n",
" progress {\n",
" -webkit-appearance: none;\n",
" border: none;\n",
" border-radius: 3px;\n",
" width: 300px;\n",
" height: 20px;\n",
" vertical-align: middle;\n",
" margin-right: 10px;\n",
" background-color: aliceblue;\n",
" }\n",
" progress::-webkit-progress-bar {\n",
" border-radius: 3px;\n",
" background-color: aliceblue;\n",
" }\n",
" progress::-webkit-progress-value {\n",
" background-color: #9d60fb;\n",
" }\n",
" progress::-moz-progress-bar {\n",
" background-color: #9d60fb;\n",
" }\n",
" </style>\n",
" "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": []
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"deepchecks - WARNING - Could not find model's classes, using the observed classes. In order to make sure the classes used by the model are inferred correctly, please use the model_classes argument\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "4f3fe89150bb402b98c3903977bb3af1",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Accordion(children=(VBox(children=(HTML(value='\\n<h1 id=\"summary_41IT4B12ZYS9KQAEQXYMKP74C\">Train Test Validat…"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Train-test validation\n",
"train_test_validation().run(train, test)"
]
},
{
"cell_type": "code",
"execution_count": 138,
"id": "e30d6caf",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"deepchecks - WARNING - Could not find model's classes, using the observed classes. In order to make sure the classes used by the model are inferred correctly, please use the model_classes argument\n"
]
},
{
"data": {
"text/html": [
"\n",
" <style>\n",
" progress {\n",
" -webkit-appearance: none;\n",
" border: none;\n",
" border-radius: 3px;\n",
" width: 300px;\n",
" height: 20px;\n",
" vertical-align: middle;\n",
" margin-right: 10px;\n",
" background-color: aliceblue;\n",
" }\n",
" progress::-webkit-progress-bar {\n",
" border-radius: 3px;\n",
" background-color: aliceblue;\n",
" }\n",
" progress::-webkit-progress-value {\n",
" background-color: #9d60fb;\n",
" }\n",
" progress::-moz-progress-bar {\n",
" background-color: #9d60fb;\n",
" }\n",
" </style>\n",
" "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": []
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "fc385d09711f4ce8b083f53c9b187b14",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Accordion(children=(VBox(children=(HTML(value='\\n<h1 id=\"summary_38JMFB4AMHRC5JGA59KUAYPGM\">Model Evaluation S…"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"train_preds = model.predict(X_train) # Get predicted labels (0 or 1)\n",
"train_probs = model.predict_proba(X_train) # Get predicted probabilities for every class (one column per class)\n",
"test_preds = model.predict(X_test) # Get predicted labels (0 or 1)\n",
"test_probs = model.predict_proba(X_test) # Get predicted probabilities for every class (one column per class)\n",
"\n",
"model_evaluation().run(train, test, \n",
" train_predictions=train_preds,\n",
" test_predictions=test_preds,\n",
" train_probabilities=train_probs,\n",
" test_probabilities=test_probs)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c535b6c4",
"metadata": {},
"outputs": [],
"source": [
"suite = full_suite()\n",
"suite.run(train_dataset=train,\n",
" test_dataset=test,\n",
" with_display=True,\n",
" train_predictions=train_preds,\n",
" test_predictions=test_preds,\n",
" train_probabilities=train_probs,\n",
" test_probabilities=test_probs)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment