Skip to content

Instantly share code, notes, and snippets.

@indiranell
Last active August 25, 2023 11:58
Show Gist options
  • Save indiranell/6eeaf2459e744136af50db9569a23ee8 to your computer and use it in GitHub Desktop.
Save indiranell/6eeaf2459e744136af50db9569a23ee8 to your computer and use it in GitHub Desktop.
Deepchecks pto detector
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 251,
"id": "cdf70556",
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"import pandas as pd\n",
"from nltk.corpus import stopwords\n",
"from nltk.stem import SnowballStemmer\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.feature_selection import SelectKBest, chi2\n",
"from deepchecks.nlp import TextData\n",
"from deepchecks.nlp.suites import full_suite\n",
"from deepchecks.nlp.suites import data_integrity\n",
"from deepchecks.nlp.suites import train_test_validation\n",
"from deepchecks.nlp.suites import model_evaluation\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.linear_model import LogisticRegression"
]
},
{
"cell_type": "code",
"execution_count": 226,
"id": "7185dc88",
"metadata": {},
"outputs": [],
"source": [
"data = pd.read_csv('pto_messages.csv', names=['sentence', 'label'])\n",
"data = data.sample(frac=1).reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 219,
"id": "3823aa3f",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sentence</th>\n",
" <th>label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>My work isnt completed yet in college so will ...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Sorry to hear that Take care Shiva</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Down with Flu cant make it today</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>My dental appointment is postponed to tomorrow...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>My pto tomorrow has been cancelled</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" sentence label\n",
"0 My work isnt completed yet in college so will ... 1\n",
"1 Sorry to hear that Take care Shiva 0\n",
"2 Down with Flu cant make it today 1\n",
"3 My dental appointment is postponed to tomorrow... 1\n",
"4 My pto tomorrow has been cancelled 0"
]
},
"execution_count": 219,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 227,
"id": "d2607269",
"metadata": {},
"outputs": [],
"source": [
"#Pre-processing\n",
"stemmer = SnowballStemmer('english')\n",
"words = stopwords.words(\"english\")\n",
"NOT_STOP_WORDS = ['not','off','be','will','before','after','out']\n",
"ADD_STOP_WORDS = ['today', 'tomorrow', 'yesterday']\n",
"for word in NOT_STOP_WORDS:\n",
" words.remove(word)\n",
"for word in ADD_STOP_WORDS:\n",
" words.append(word)\n",
"#data['cleaned'] = data['sentence'].apply(lambda x: \" \".join([stemmer.stem(i) for i in re.sub(\"[^a-zA-Z]\", \" \", x).split() if i not in words]).lower())\n",
"X_train, X_test, y_train, y_test = train_test_split(data['sentence'], data.label, test_size=0.1)"
]
},
{
"cell_type": "code",
"execution_count": 253,
"id": "17455c58",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.8726114649681529\n"
]
}
],
"source": [
"pipeline = Pipeline([('vect', TfidfVectorizer(ngram_range=(1, 4), stop_words=words, sublinear_tf=True)),\n",
" ('chi', SelectKBest(chi2, k=1000)), \n",
" ('lgr', LogisticRegression(C=1.0, penalty='l2', max_iter=1000))])\n",
" #('rfc',RandomForestClassifier(n_estimators=100))])\n",
"\n",
"model = pipeline.fit(X_train, y_train)\n",
"accuracy_score = model.score(X_test, y_test)\n",
"\n",
"print (accuracy_score)"
]
},
{
"cell_type": "code",
"execution_count": 255,
"id": "20fa2e9d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" <style>\n",
" progress {\n",
" -webkit-appearance: none;\n",
" border: none;\n",
" border-radius: 3px;\n",
" width: 300px;\n",
" height: 20px;\n",
" vertical-align: middle;\n",
" margin-right: 10px;\n",
" background-color: aliceblue;\n",
" }\n",
" progress::-webkit-progress-bar {\n",
" border-radius: 3px;\n",
" background-color: aliceblue;\n",
" }\n",
" progress::-webkit-progress-value {\n",
" background-color: #9d60fb;\n",
" }\n",
" progress::-moz-progress-bar {\n",
" background-color: #9d60fb;\n",
" }\n",
" </style>\n",
" "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": []
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "938c172f693b479584f2e6605aa19333",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Accordion(children=(VBox(children=(HTML(value='\\n<h1 id=\"summary_MJEOJ14ZBPX1IE690ZW2D539V\">Data Integrity Sui…"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"#Data integrity\n",
"train = TextData(X_train, label=y_train, task_type='text_classification')\n",
"test = TextData(X_test, label=y_test, task_type='text_classification')\n",
"data_integrity_suite = data_integrity()\n",
"\n",
"data_integrity_suite.run(train, test)\n"
]
},
{
"cell_type": "code",
"execution_count": 257,
"id": "2344dcb6",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" <style>\n",
" progress {\n",
" -webkit-appearance: none;\n",
" border: none;\n",
" border-radius: 3px;\n",
" width: 300px;\n",
" height: 20px;\n",
" vertical-align: middle;\n",
" margin-right: 10px;\n",
" background-color: aliceblue;\n",
" }\n",
" progress::-webkit-progress-bar {\n",
" border-radius: 3px;\n",
" background-color: aliceblue;\n",
" }\n",
" progress::-webkit-progress-value {\n",
" background-color: #9d60fb;\n",
" }\n",
" progress::-moz-progress-bar {\n",
" background-color: #9d60fb;\n",
" }\n",
" </style>\n",
" "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": []
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"deepchecks - WARNING - Could not find model's classes, using the observed classes. In order to make sure the classes used by the model are inferred correctly, please use the model_classes argument\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "266fbe3b6400468e94a7333a379af81b",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Accordion(children=(VBox(children=(HTML(value='\\n<h1 id=\"summary_NF14E7GML76C67VHMEDOQS00D\">Train Test Validat…"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"#Train Test Validation\n",
"train_test_validation().run(train, test)"
]
},
{
"cell_type": "code",
"execution_count": 254,
"id": "e30d6caf",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"deepchecks - WARNING - Could not find model's classes, using the observed classes. In order to make sure the classes used by the model are inferred correctly, please use the model_classes argument\n"
]
},
{
"data": {
"text/html": [
"\n",
" <style>\n",
" progress {\n",
" -webkit-appearance: none;\n",
" border: none;\n",
" border-radius: 3px;\n",
" width: 300px;\n",
" height: 20px;\n",
" vertical-align: middle;\n",
" margin-right: 10px;\n",
" background-color: aliceblue;\n",
" }\n",
" progress::-webkit-progress-bar {\n",
" border-radius: 3px;\n",
" background-color: aliceblue;\n",
" }\n",
" progress::-webkit-progress-value {\n",
" background-color: #9d60fb;\n",
" }\n",
" progress::-moz-progress-bar {\n",
" background-color: #9d60fb;\n",
" }\n",
" </style>\n",
" "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": []
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "00d861061f5a435abd48a36ecbb0f6d0",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Accordion(children=(VBox(children=(HTML(value='\\n<h1 id=\"summary_NRX7VYKUW92IZ19B7BF7CCGWE\">Model Evaluation S…"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"accuracy_score = model.score(X_test, y_test)\n",
"train_preds = model.predict(X_train) # Get predicted labels (0 or 1)\n",
"train_probs = model.predict_proba(X_train) # Get per-class predicted probabilities, shape (n_samples, n_classes)\n",
"test_preds = model.predict(X_test) # Get predicted labels (0 or 1)\n",
"test_probs = model.predict_proba(X_test) # Get per-class predicted probabilities, shape (n_samples, n_classes)\n",
"\n",
"model_evaluation().run(train, test, \n",
" train_predictions=train_preds,\n",
" test_predictions=test_preds,\n",
" train_probabilities=train_probs,\n",
" test_probabilities=test_probs) \n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c535b6c4",
"metadata": {},
"outputs": [],
"source": [
"suite = full_suite()\n",
"suite.run(train_dataset=train,\n",
" test_dataset=test,\n",
" with_display=True,\n",
" train_predictions=train_preds,\n",
" test_predictions=test_preds,\n",
" train_probabilities=train_probs,\n",
" test_probabilities=test_probs)"
]
},
{
"cell_type": "code",
"execution_count": 205,
"id": "c7450be0",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"deepchecks - WARNING - Could not find model's classes, using the observed classes. In order to make sure the classes used by the model are inferred correctly, please use the model_classes argument\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "380e83b027ba4057b639ed62c0144496",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"VBox(children=(HTML(value='<h4><b>Confusion Matrix Report</b></h4>'), HTML(value='<p>Calculate the confusion m…"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from deepchecks.nlp.checks import ConfusionMatrixReport\n",
"check = ConfusionMatrixReport()\n",
"result = check.run(train, predictions=train_preds)\n",
"result.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "06bf6904",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "f9878494",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "4350888e",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "1f4a97de",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment