Skip to content

Instantly share code, notes, and snippets.

@lewiuberg
Last active September 4, 2020 07:52
Show Gist options
  • Save lewiuberg/e7f8ad2a436bb47560319097f101407a to your computer and use it in GitHub Desktop.
Save lewiuberg/e7f8ad2a436bb47560319097f101407a to your computer and use it in GitHub Desktop.
nuc_machine_learning/Session 4/tutorial_4.ipynb
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"metadata": {},
"cell_type": "markdown",
"source": "# Activity sheet 4\n# Classification - KNN\n\n**Objective**\nThe aim of this activity sheet is to load a customer dataset, fit the data, and use KNN to predict a data point.\n\n**Dataset description**\nA telecommunication company has categorised its customers into four groups depending upon their service usage. The prime business interest is to customise the product offers for its valuable customers. This is a classification problem: a classifier is to be designed from the historical dataset and then used to predict the class of an unknown case. The target field is ‘custcat’. You are required to use KNN classification for this dataset."
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "import sys\nimport numpy as np\nimport pandas as pd\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom matplotlib.ticker import FuncFormatter\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.metrics import accuracy_score, confusion_matrix, classification_report, plot_confusion_matrix\n\n# %matplotlib notebook\n%matplotlib inline\n\nsns.set(style=\"whitegrid\")",
"execution_count": 1,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "def unpack_list(lst):\n    \"\"\"Join items into an English list that uses the Oxford comma.\n\n    Args:\n        lst: An iterable of items (each is converted with str), or a\n            single string, which is treated as one item rather than\n            being split into characters.\n\n    Returns:\n        None for no items, the item itself for one, \"a and b\" for two,\n        and \"a, b, and c\" for three or more.\n    \"\"\"\n    if isinstance(lst, str):\n        items = [lst] if lst else []\n    else:\n        items = [str(item) for item in lst]\n\n    if not items:\n        return None\n    if len(items) <= 2:\n        # The Oxford comma only applies to lists of three or more items.\n        return \" and \".join(items)\n    return \", \".join(items[:-1]) + \", and \" + items[-1]",
"execution_count": 2,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "def word_search(df, *words):\n    \"\"\"Count cells in the text columns of df that equal any of the given words.\n\n    Only columns with an object or string dtype are searched. A cell is a\n    hit when its whole value equals the word; words are compared literally,\n    not as regular expressions.\n\n    Prints the number of text columns and the total number of hits, and\n    returns None. Nothing is printed when no words are given.\n    \"\"\"\n    if not words:\n        return\n\n    # The set of text columns does not depend on the word, so find it once.\n    text_columns = [\n        column for column in df\n        if pd.api.types.is_object_dtype(df[column].dtype)\n        or pd.api.types.is_string_dtype(df[column].dtype)\n    ]\n\n    sum_words = 0\n    found_words = []\n    for word in words:\n        # Plain equality replaces the unescaped \"^word$\" regex: it cannot be\n        # broken by metacharacters and it tolerates non-string cells.\n        hits = sum(int((df[column] == str(word)).sum()) for column in text_columns)\n        if hits and word not in found_words:\n            found_words.append(word)\n        sum_words += hits\n\n    # Name every searched word in the report when none of them was found.\n    if not found_words:\n        found_words = list(words)\n    print(\"Columns of dtype str or object:\", len(text_columns))\n    print(f\"Instances of {unpack_list(found_words)} in the dataframe: {sum_words}\")",
"execution_count": 3,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "df = pd.read_csv(\"teleCust1000t.csv\")",
"execution_count": 4,
"outputs": []
},
{
"metadata": {
"scrolled": true,
"trusted": true
},
"cell_type": "code",
"source": "df.head()",
"execution_count": 5,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 5,
"data": {
"text/plain": " region tenure age marital address income ed employ retire gender \\\n0 2 13 44 1 9 64.0 4 5 0.0 0 \n1 3 11 33 1 7 136.0 5 5 0.0 0 \n2 3 68 52 1 24 116.0 1 29 0.0 1 \n3 2 33 33 0 12 33.0 2 0 0.0 1 \n4 2 23 30 1 9 30.0 1 2 0.0 0 \n\n reside custcat \n0 2 1 \n1 6 4 \n2 2 3 \n3 1 1 \n4 4 3 ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>region</th>\n <th>tenure</th>\n <th>age</th>\n <th>marital</th>\n <th>address</th>\n <th>income</th>\n <th>ed</th>\n <th>employ</th>\n <th>retire</th>\n <th>gender</th>\n <th>reside</th>\n <th>custcat</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>2</td>\n <td>13</td>\n <td>44</td>\n <td>1</td>\n <td>9</td>\n <td>64.0</td>\n <td>4</td>\n <td>5</td>\n <td>0.0</td>\n <td>0</td>\n <td>2</td>\n <td>1</td>\n </tr>\n <tr>\n <th>1</th>\n <td>3</td>\n <td>11</td>\n <td>33</td>\n <td>1</td>\n <td>7</td>\n <td>136.0</td>\n <td>5</td>\n <td>5</td>\n <td>0.0</td>\n <td>0</td>\n <td>6</td>\n <td>4</td>\n </tr>\n <tr>\n <th>2</th>\n <td>3</td>\n <td>68</td>\n <td>52</td>\n <td>1</td>\n <td>24</td>\n <td>116.0</td>\n <td>1</td>\n <td>29</td>\n <td>0.0</td>\n <td>1</td>\n <td>2</td>\n <td>3</td>\n </tr>\n <tr>\n <th>3</th>\n <td>2</td>\n <td>33</td>\n <td>33</td>\n <td>0</td>\n <td>12</td>\n <td>33.0</td>\n <td>2</td>\n <td>0</td>\n <td>0.0</td>\n <td>1</td>\n <td>1</td>\n <td>1</td>\n </tr>\n <tr>\n <th>4</th>\n <td>2</td>\n <td>23</td>\n <td>30</td>\n <td>1</td>\n <td>9</td>\n <td>30.0</td>\n <td>1</td>\n <td>2</td>\n <td>0.0</td>\n <td>0</td>\n <td>4</td>\n <td>3</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"metadata": {}
}
]
},
{
"metadata": {
"scrolled": true,
"trusted": true
},
"cell_type": "code",
"source": "df.info()",
"execution_count": 6,
"outputs": [
{
"output_type": "stream",
"text": "<class 'pandas.core.frame.DataFrame'>\nRangeIndex: 1000 entries, 0 to 999\nData columns (total 12 columns):\n # Column Non-Null Count Dtype \n--- ------ -------------- ----- \n 0 region 1000 non-null int64 \n 1 tenure 1000 non-null int64 \n 2 age 1000 non-null int64 \n 3 marital 1000 non-null int64 \n 4 address 1000 non-null int64 \n 5 income 1000 non-null float64\n 6 ed 1000 non-null int64 \n 7 employ 1000 non-null int64 \n 8 retire 1000 non-null float64\n 9 gender 1000 non-null int64 \n 10 reside 1000 non-null int64 \n 11 custcat 1000 non-null int64 \ndtypes: float64(2), int64(10)\nmemory usage: 93.9 KB\n",
"name": "stdout"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "print(f\"Columns with missing data: {df.isnull().any().sum()}\")\nprint(f\"Instances of missing data: {df.isnull().sum().sum()}\")",
"execution_count": 7,
"outputs": [
{
"output_type": "stream",
"text": "Columns with missing data: 0\nInstances of missing data: 0\n",
"name": "stdout"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "word_search(df, \"None\", \"none\", \"0\")",
"execution_count": 8,
"outputs": [
{
"output_type": "stream",
"text": "Columns of dtype str or object: 0\nInstances of None, none, and 0 in the dataframe: 0\n",
"name": "stdout"
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Basic data preprocessing\n1. See how many instances of each class are in the given dataset using visualisation techniques."
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "df[\"custcat\"] = df[\"custcat\"].astype(\"category\")",
"execution_count": 9,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "df.custcat.value_counts()",
"execution_count": 10,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 10,
"data": {
"text/plain": "3 281\n1 266\n4 236\n2 217\nName: custcat, dtype: int64"
},
"metadata": {}
}
]
},
{
"metadata": {
"scrolled": false,
"trusted": true
},
"cell_type": "code",
"source": "sns.catplot(data=df, kind=\"count\", x=\"custcat\");",
"execution_count": 11,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": "<Figure size 360x360 with 1 Axes>",
"image/png": "\n"
},
"metadata": {}
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "2. Convert Pandas DataFrame to NumPy array"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "X = df.drop(\"custcat\", axis=1).to_numpy()\nprint(X[0:5])",
"execution_count": 12,
"outputs": [
{
"output_type": "stream",
"text": "[[ 2. 13. 44. 1. 9. 64. 4. 5. 0. 0. 2.]\n [ 3. 11. 33. 1. 7. 136. 5. 5. 0. 0. 6.]\n [ 3. 68. 52. 1. 24. 116. 1. 29. 0. 1. 2.]\n [ 2. 33. 33. 0. 12. 33. 2. 0. 0. 1. 1.]\n [ 2. 23. 30. 1. 9. 30. 1. 2. 0. 0. 4.]]\n",
"name": "stdout"
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "3. Define the attribute set and the target value ‘custcat’"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "y = df.custcat.values\ny.value_counts()",
"execution_count": 13,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 13,
"data": {
"text/plain": "1 266\n2 217\n3 281\n4 236\ndtype: int64"
},
"metadata": {}
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "4. Perform z-score standardisation for feature scaling using preprocessing.StandardScaler() from sklearn"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "scaler = StandardScaler()\nscaler",
"execution_count": 14,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 14,
"data": {
"text/plain": "StandardScaler()"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "scaler.fit(X)",
"execution_count": 15,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 15,
"data": {
"text/plain": "StandardScaler()"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "scaler.mean_",
"execution_count": 16,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 16,
"data": {
"text/plain": "array([2.0220e+00, 3.5526e+01, 4.1684e+01, 4.9500e-01, 1.1551e+01,\n 7.7535e+01, 2.6710e+00, 1.0987e+01, 4.7000e-02, 5.1700e-01,\n 2.3310e+00])"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "scaler.transform(X)",
"execution_count": 17,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 17,
"data": {
"text/plain": "array([[-0.02696767, -1.055125 , 0.18450456, ..., -0.22207644,\n -1.03459817, -0.23065004],\n [ 1.19883553, -1.14880563, -0.69181243, ..., -0.22207644,\n -1.03459817, 2.55666158],\n [ 1.19883553, 1.52109247, 0.82182601, ..., -0.22207644,\n 0.96655883, -0.23065004],\n ...,\n [ 1.19883553, 1.47425216, 1.37948227, ..., -0.22207644,\n 0.96655883, -0.92747794],\n [ 1.19883553, 1.61477311, 0.58283046, ..., -0.22207644,\n 0.96655883, -0.92747794],\n [ 1.19883553, 0.67796676, -0.45281689, ..., -0.22207644,\n 0.96655883, 0.46617787]])"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# All in one go.\nX_scale = scaler.fit_transform(X)\nX_scale ",
"execution_count": 18,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 18,
"data": {
"text/plain": "array([[-0.02696767, -1.055125 , 0.18450456, ..., -0.22207644,\n -1.03459817, -0.23065004],\n [ 1.19883553, -1.14880563, -0.69181243, ..., -0.22207644,\n -1.03459817, 2.55666158],\n [ 1.19883553, 1.52109247, 0.82182601, ..., -0.22207644,\n 0.96655883, -0.23065004],\n ...,\n [ 1.19883553, 1.47425216, 1.37948227, ..., -0.22207644,\n 0.96655883, -0.92747794],\n [ 1.19883553, 1.61477311, 0.58283046, ..., -0.22207644,\n 0.96655883, -0.92747794],\n [ 1.19883553, 0.67796676, -0.45281689, ..., -0.22207644,\n 0.96655883, 0.46617787]])"
},
"metadata": {}
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "5. Using scikit-learn perform training and test data split (80% - 20%)"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "X_train, X_test, y_train, y_test = train_test_split(\n X_scale, y, test_size=0.2, random_state=4)",
"execution_count": 19,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Kick start KNN\n1. Develop a classifier with K = 3 using KNeighborsClassifier() from sklearn.neighbors"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "k = 3",
"execution_count": 20,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "neigh = KNeighborsClassifier(n_neighbors=k)",
"execution_count": 21,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "neigh",
"execution_count": 22,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 22,
"data": {
"text/plain": "KNeighborsClassifier(n_neighbors=3)"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "neigh.get_params()",
"execution_count": 23,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 23,
"data": {
"text/plain": "{'algorithm': 'auto',\n 'leaf_size': 30,\n 'metric': 'minkowski',\n 'metric_params': None,\n 'n_jobs': None,\n 'n_neighbors': 3,\n 'p': 2,\n 'weights': 'uniform'}"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "neigh.fit(X_train, y_train)",
"execution_count": 24,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 24,
"data": {
"text/plain": "KNeighborsClassifier(n_neighbors=3)"
},
"metadata": {}
}
]
},
{
"metadata": {
"scrolled": true,
"trusted": true
},
"cell_type": "code",
"source": "print(f\"Customer class labels: {unpack_list(neigh.classes_)}\")",
"execution_count": 25,
"outputs": [
{
"output_type": "stream",
"text": "Customer class labels: 1, 2, 3, and 4\n",
"name": "stdout"
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "2. Check the accuracy of your classifier on the test dataset, using metrics.accuracy_score() from sklearn."
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "print(\"Train set Accuracy: \", neigh.score(X_train, y_train))",
"execution_count": 26,
"outputs": [
{
"output_type": "stream",
"text": "Train set Accuracy: 0.56875\n",
"name": "stdout"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Alternative\nprint(\"The fraction of correctly classified samples on the train set:\", accuracy_score(y_train, neigh.predict(X_train), normalize=True))\nprint(\"The number of correctly classified samples on the train set:\", accuracy_score(y_train, neigh.predict(X_train), normalize=False))",
"execution_count": 27,
"outputs": [
{
"output_type": "stream",
"text": "The fraction of correctly classified samples on the train set: 0.56875\nThe number of correctly classified samples on the train set: 455\n",
"name": "stdout"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "y_pred = neigh.predict(X_test)",
"execution_count": 28,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "y_pred[0]",
"execution_count": 29,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 29,
"data": {
"text/plain": "1"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "y_pred[0:5]",
"execution_count": 30,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 30,
"data": {
"text/plain": "array([1, 3, 1, 4, 4])"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Score the held-out split: this is the test accuracy, not the train accuracy.\nprint(\"Test set Accuracy: \", neigh.score(X_test, y_test))",
"execution_count": 31,
"outputs": [
{
"output_type": "stream",
"text": "Test set Accuracy: 0.315\n",
"name": "stdout"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Alternative\nprint(\"The fraction of correctly classified samples on the test set:\", accuracy_score(y_test, y_pred, normalize=True, sample_weight=None))\nprint(\"The number of correctly classified samples on the test set:\", accuracy_score(y_test, y_pred, normalize=False, sample_weight=None))",
"execution_count": 32,
"outputs": [
{
"output_type": "stream",
"text": "The fraction of correctly classified samples on the test set: 0.315\nThe number of correctly classified samples on the test set: 63\n",
"name": "stdout"
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "**Confusion Matrix**"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "![alt text](https://2.bp.blogspot.com/-EvSXDotTOwc/XMfeOGZ-CVI/AAAAAAAAEiE/oePFfvhfOQM11dgRn9FkPxlegCXbgOF4QCLcBGAs/s1600/confusionMatrxiUpdated.jpg)"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "print(cm :=confusion_matrix(y_test, y_pred))\nprint()\nTP = np.diag(cm)\nprint(\"TP: \", TP)\nFN = cm.sum(axis=1) - np.diag(cm)\nprint(\"FN: \", FN)\nFP = cm.sum(axis=0) - np.diag(cm)\nprint(\"FP: \", FP)\nTN = cm.sum() - (FP + FN + TP)\nprint(\"TN: \", TN)\n\nprint()\n# Sensitivity, hit rate, recall, or true positive rate\nTPR = TP/(TP+FN)\nprint(\"TPR: \", TPR)\n# Specificity or true negative rate\nTNR = TN/(TN+FP) \nprint(\"TNR: \", TNR)\n# Precision or positive predictive value\nPPV = TP/(TP+FP)\nprint(\"PPV: \", PPV)\n# Negative predictive value\nNPV = TN/(TN+FN)\nprint(\"NPV: \", NPV)\n# Fall out or false positive rate\nFPR = FP/(FP+TN)\nprint(\"FPR: \", FPR)\n# False negative rate\nFNR = FN/(TP+FN)\nprint(\"FNR: \", FNR)\n# False discovery rate\nFDR = FP/(TP+FP)\nprint(\"FDR: \", FDR)\n# Overall accuracy\nACC = (TP+TN)/(TP+FP+FN+TN)\nprint(\"ACC: \", ACC)",
"execution_count": 33,
"outputs": [
{
"output_type": "stream",
"text": "[[30 5 12 4]\n [ 9 14 9 12]\n [25 12 11 6]\n [21 14 8 8]]\n\nTP: [30 14 11 8]\nFN: [21 30 43 43]\nFP: [55 31 29 22]\nTN: [ 94 125 117 127]\n\nTPR: [0.58823529 0.31818182 0.2037037 0.15686275]\nTNR: [0.63087248 0.80128205 0.80136986 0.85234899]\nPPV: [0.35294118 0.31111111 0.275 0.26666667]\nNPV: [0.8173913 0.80645161 0.73125 0.74705882]\nFPR: [0.36912752 0.19871795 0.19863014 0.14765101]\nFNR: [0.41176471 0.68181818 0.7962963 0.84313725]\nFDR: [0.64705882 0.68888889 0.725 0.73333333]\nACC: [0.62 0.695 0.64 0.675]\n",
"name": "stdout"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "TP.sum()",
"execution_count": 34,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 34,
"data": {
"text/plain": "63"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "np.diag(cm)",
"execution_count": 35,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 35,
"data": {
"text/plain": "array([30, 14, 11, 8])"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "plot_confusion_matrix(neigh, X_test, y_test);",
"execution_count": 36,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": "<Figure size 432x288 with 2 Axes>",
"image/png": "\n"
},
"metadata": {}
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Which is the optimal value of K?\nDevelop a loop wherein you can evaluate the accuracy of your KNN classifier starting from K = 1 until K = 10. You are required to calculate the model accuracy at every step as before."
},
{
"metadata": {},
"cell_type": "markdown",
"source": "**It's been a while since I have used a dict, so I am trying it out here, just to keep things fresh.**"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "k_n: int = 10  # largest K to evaluate\nscore: dict = {}  # K -> accuracy on the test set\nerror_rate: list = []  # misclassification rate for K = 1..k_n, in order\nfor n in range(1, k_n + 1):\n    # Loop-local names keep the K=3 `neigh` and `y_pred` from the earlier\n    # cells intact instead of silently replacing them with the last model.\n    knn_n = KNeighborsClassifier(n_neighbors=n).fit(X_train, y_train)\n    y_pred_n = knn_n.predict(X_test)\n    score[n] = np.mean(y_pred_n == y_test)\n    error_rate.append(np.mean(y_pred_n != y_test))  # complement of accuracy",
"execution_count": 37,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "error_rate",
"execution_count": 38,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 38,
"data": {
"text/plain": "[0.7, 0.71, 0.685, 0.68, 0.685, 0.69, 0.665, 0.675, 0.66, 0.67]"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "score",
"execution_count": 39,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 39,
"data": {
"text/plain": "{1: 0.3,\n 2: 0.29,\n 3: 0.315,\n 4: 0.32,\n 5: 0.315,\n 6: 0.31,\n 7: 0.335,\n 8: 0.325,\n 9: 0.34,\n 10: 0.33}"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "for i in score.items():\n print(i)",
"execution_count": 40,
"outputs": [
{
"output_type": "stream",
"text": "(1, 0.3)\n(2, 0.29)\n(3, 0.315)\n(4, 0.32)\n(5, 0.315)\n(6, 0.31)\n(7, 0.335)\n(8, 0.325)\n(9, 0.34)\n(10, 0.33)\n",
"name": "stdout"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "for i in score:\n print(i, \":\", score[i])",
"execution_count": 41,
"outputs": [
{
"output_type": "stream",
"text": "1 : 0.3\n2 : 0.29\n3 : 0.315\n4 : 0.32\n5 : 0.315\n6 : 0.31\n7 : 0.335\n8 : 0.325\n9 : 0.34\n10 : 0.33\n",
"name": "stdout"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "score = pd.DataFrame.from_dict(score, orient=\"index\")",
"execution_count": 42,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "score.rename(columns = {0:\"Score\"}, inplace = True)",
"execution_count": 43,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "score",
"execution_count": 44,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 44,
"data": {
"text/plain": " Score\n1 0.300\n2 0.290\n3 0.315\n4 0.320\n5 0.315\n6 0.310\n7 0.335\n8 0.325\n9 0.340\n10 0.330",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Score</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>1</th>\n <td>0.300</td>\n </tr>\n <tr>\n <th>2</th>\n <td>0.290</td>\n </tr>\n <tr>\n <th>3</th>\n <td>0.315</td>\n </tr>\n <tr>\n <th>4</th>\n <td>0.320</td>\n </tr>\n <tr>\n <th>5</th>\n <td>0.315</td>\n </tr>\n <tr>\n <th>6</th>\n <td>0.310</td>\n </tr>\n <tr>\n <th>7</th>\n <td>0.335</td>\n </tr>\n <tr>\n <th>8</th>\n <td>0.325</td>\n </tr>\n <tr>\n <th>9</th>\n <td>0.340</td>\n </tr>\n <tr>\n <th>10</th>\n <td>0.330</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"metadata": {}
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Visualisation of model accuracy\nPlot the model accuracy for the different values of K. The purpose of this task is to visualise the comparison among the various neighbours, as performed in the above task."
},
{
"metadata": {},
"cell_type": "markdown",
"source": "How great they want you to believe knn is:"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "ax = score.plot(kind=\"line\", figsize=(10,6), xticks=range(1,score.index.max()+1));",
"execution_count": 45,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": "<Figure size 720x432 with 1 Axes>",
"image/png": "\n"
},
"metadata": {}
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "How good it actually is:"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "ax = score.plot(kind=\"line\", figsize=(10,6), xticks=range(1,score.index.max()+1))\nplt.yticks(np.arange(0, 1.1, .1));",
"execution_count": 46,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": "<Figure size 720x432 with 1 Axes>",
"image/png": "\n"
},
"metadata": {}
}
]
},
{
"metadata": {
"scrolled": false,
"trusted": true
},
"cell_type": "code",
"source": "# error_rate[i] belongs to K = i + 1, so plot against K rather than the list index.\nk_values = list(range(1, len(error_rate) + 1))\nplt.figure(figsize=(10, 6))\nplt.plot(k_values, error_rate, color=\"b\", linestyle=\"dashed\", marker=\"o\", markerfacecolor=\"red\", markersize=10)\nplt.xticks(k_values)\nplt.xlabel(\"K (number of neighbours)\")\nplt.ylabel(\"Error rate on the test set\")\nplt.title(f\"KNN error rate for K = 1 to {len(error_rate)}\");",
"execution_count": 47,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": "<Figure size 720x432 with 1 Axes>",
"image/png": "\n"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "",
"execution_count": null,
"outputs": []
}
],
"metadata": {
"_draft": {
"nbviewer_url": "https://gist.github.com/e7f8ad2a436bb47560319097f101407a"
},
"gist": {
"id": "e7f8ad2a436bb47560319097f101407a",
"data": {
"description": "nuc_machine_learning/Session 4/tutorial_4.ipynb",
"public": true
}
},
"hide_input": false,
"kernelspec": {
"name": "venv",
"display_name": "venv",
"language": "python"
},
"language_info": {
"name": "python",
"version": "3.8.3",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
},
"latex_envs": {
"eqNumInitial": 1,
"eqLabelWithNumbers": true,
"current_citInitial": 1,
"cite_by": "apalike",
"bibliofile": "biblio.bib",
"LaTeX_envs_menu_present": true,
"labels_anchors": false,
"latex_user_defs": false,
"user_envs_cfg": false,
"report_style_numbering": false,
"autoclose": false,
"autocomplete": true,
"hotkeys": {
"equation": "Ctrl-E",
"itemize": "Ctrl-I"
}
},
"toc": {
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": true,
"base_numbering": 1,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
},
"varInspector": {
"window_display": false,
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"library": "var_list.py",
"delete_cmd_prefix": "del ",
"delete_cmd_postfix": "",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"library": "var_list.r",
"delete_cmd_prefix": "rm(",
"delete_cmd_postfix": ") ",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
]
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment