aegorenkov/class_weight_recall_precision_example.ipynb

## class_weight_recall_precision_example.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn import datasets, cross_validation, linear_model, metrics\n",
    "import pandas as pd\n",
    "\n",
    "%matplotlib inline\n",
    "# Load in pre-packaged data from sklearn and convert to Pandas DF\n",
    "iris = datasets.load_iris()\n",
    "\n",
    "# Convert feature data to DataFrame\n",
    "irisdf = pd.DataFrame(iris.data, columns=iris.feature_names)\n",
    "\n",
    "# Convert outcome data to DataFrame\n",
    "iris_outcome = pd.DataFrame(iris.target, columns=['species'])\n",
    "\n",
    "# We need to put the outcome and data together if we want to explore it\n",
    "# They have matching indices, so let's join them by index\n",
    "irisdf = irisdf.join(iris_outcome)\n",
    "\n",
    "# The default feature names are awkward to work with, so rename the columns\n",
    "irisdf.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Create a Logistic Regression model to predict the species type\n",
    "features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']\n",
    "X = irisdf[irisdf.species.isin([0, 1, 2])][features]\n",
    "y = irisdf[irisdf.species.isin([0, 1, 2])]['species']\n",
    "X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=.5, random_state=5)\n",
    "logit = linear_model.LogisticRegression(multi_class='ovr', class_weight={0:1, 1:1, 2:1}).fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "y_pred = logit.predict(X_test)\n",
    "y_pred_proba = logit.predict_proba(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[22,  0,  0],\n",
       "       [ 0, 21,  8],\n",
       "       [ 0,  0, 24]])"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "metrics.confusion_matrix(y_test, y_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[53,  0],\n",
       "       [ 0, 22]])"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "replacement = {0:1, 1:0, 2:0}\n",
    "metrics.confusion_matrix(y_test.replace(replacement), \n",
    "                         pd.Series(y_pred).replace(replacement))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[46,  0],\n",
       "       [ 8, 21]])"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "replacement = {0:0, 1:1, 2:0}\n",
    "metrics.confusion_matrix(y_test.replace(replacement), \n",
    "                         pd.Series(y_pred).replace(replacement))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[43,  8],\n",
       "       [ 0, 24]])"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "replacement = {0:0, 1:0, 2:1}\n",
    "metrics.confusion_matrix(y_test.replace(replacement), \n",
    "                         pd.Series(y_pred).replace(replacement))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.893333333333\n"
     ]
    }
   ],
   "source": [
    "print metrics.accuracy_score(y_test, y_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "          0       1.00      1.00      1.00        22\n",
      "          1       1.00      0.72      0.84        29\n",
      "          2       0.75      1.00      0.86        24\n",
      "\n",
      "avg / total       0.92      0.89      0.89        75\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# The simpler way, however, is to let sklearn summarize the metrics we are interested in\n",
    "print metrics.classification_report(y_test, y_pred)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**The precision on class 2 is weak (0.75) while its recall is maxed out (1.00). Let's down-weight class 2 via `class_weight` to trade some of that recall for precision.**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Refit the Logistic Regression model, this time with class 2 down-weighted to 0.4\n",
    "features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']\n",
    "X = irisdf[irisdf.species.isin([0, 1, 2])][features]\n",
    "y = irisdf[irisdf.species.isin([0, 1, 2])]['species']\n",
    "X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=.5, random_state=5)\n",
    "logit = linear_model.LogisticRegression(multi_class='ovr', class_weight={0:1, 1:1, 2:.4}).fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "y_pred = logit.predict(X_test)\n",
    "y_pred_proba = logit.predict_proba(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.946666666667\n"
     ]
    }
   ],
   "source": [
    "print metrics.accuracy_score(y_test, y_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "          0       1.00      1.00      1.00        22\n",
      "          1       0.96      0.90      0.93        29\n",
      "          2       0.88      0.96      0.92        24\n",
      "\n",
      "avg / total       0.95      0.95      0.95        75\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Let sklearn summarize per-class precision, recall and F1 for the re-weighted model\n",
    "print metrics.classification_report(y_test, y_pred)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [Root]",
   "language": "python",
   "name": "Python [Root]"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 23,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"from sklearn import datasets, cross_validation, linear_model, metrics\n",
	"import pandas as pd\n",
	"\n",
	"%matplotlib inline\n",
	"# Load in pre-packaged data from sklearn and convert to Pandas DF\n",
	"iris = datasets.load_iris()\n",
	"\n",
	"# Convert feature data to DataFrame\n",
	"irisdf = pd.DataFrame(iris.data, columns=iris.feature_names)\n",
	"\n",
	"# Convert outcome data to DataFrame\n",
	"iris_outcome = pd.DataFrame(iris.target, columns=['species'])\n",
	"\n",
	"# We need to put the outcome and data together if we want to explore it\n",
	"# They have matching indices, so let's join them by index\n",
	"irisdf = irisdf.join(iris_outcome)\n",
	"\n",
	"# The default feature names are awkward to work with, so rename the columns\n",
	"irisdf.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 26,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Create a Logistic Regression model to predict the species type\n",
	"features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']\n",
	"X = irisdf[irisdf.species.isin([0, 1, 2])][features]\n",
	"y = irisdf[irisdf.species.isin([0, 1, 2])]['species']\n",
	"X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=.5, random_state=5)\n",
	"logit = linear_model.LogisticRegression(multi_class='ovr', class_weight={0:1, 1:1, 2:1}).fit(X_train, y_train)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 27,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"y_pred = logit.predict(X_test)\n",
	"y_pred_proba = logit.predict_proba(X_test)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 28,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"array([[22, 0, 0],\n",
	" [ 0, 21, 8],\n",
	" [ 0, 0, 24]])"
	]
	},
	"execution_count": 28,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"metrics.confusion_matrix(y_test, y_pred)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 29,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"array([[53, 0],\n",
	" [ 0, 22]])"
	]
	},
	"execution_count": 29,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"replacement = {0:1, 1:0, 2:0}\n",
	"metrics.confusion_matrix(y_test.replace(replacement), \n",
	" pd.Series(y_pred).replace(replacement))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 30,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"array([[46, 0],\n",
	" [ 8, 21]])"
	]
	},
	"execution_count": 30,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"replacement = {0:0, 1:1, 2:0}\n",
	"metrics.confusion_matrix(y_test.replace(replacement), \n",
	" pd.Series(y_pred).replace(replacement))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 31,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"array([[43, 8],\n",
	" [ 0, 24]])"
	]
	},
	"execution_count": 31,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"replacement = {0:0, 1:0, 2:1}\n",
	"metrics.confusion_matrix(y_test.replace(replacement), \n",
	" pd.Series(y_pred).replace(replacement))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 32,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"0.893333333333\n"
	]
	}
	],
	"source": [
	"print metrics.accuracy_score(y_test, y_pred)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 33,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" precision recall f1-score support\n",
	"\n",
	" 0 1.00 1.00 1.00 22\n",
	" 1 1.00 0.72 0.84 29\n",
	" 2 0.75 1.00 0.86 24\n",
	"\n",
	"avg / total 0.92 0.89 0.89 75\n",
	"\n"
	]
	}
	],
	"source": [
	"# The simpler way, however, is to let sklearn summarize the metrics we are interested in\n",
	"print metrics.classification_report(y_test, y_pred)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"The precision on class 2 is weak (0.75) while its recall is maxed out (1.00). Let's down-weight class 2 via `class_weight` to trade some of that recall for precision."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 43,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Refit the Logistic Regression model, this time with class 2 down-weighted to 0.4\n",
	"features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']\n",
	"X = irisdf[irisdf.species.isin([0, 1, 2])][features]\n",
	"y = irisdf[irisdf.species.isin([0, 1, 2])]['species']\n",
	"X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=.5, random_state=5)\n",
	"logit = linear_model.LogisticRegression(multi_class='ovr', class_weight={0:1, 1:1, 2:.4}).fit(X_train, y_train)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 44,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"y_pred = logit.predict(X_test)\n",
	"y_pred_proba = logit.predict_proba(X_test)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 45,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"0.946666666667\n"
	]
	}
	],
	"source": [
	"print metrics.accuracy_score(y_test, y_pred)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 46,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" precision recall f1-score support\n",
	"\n",
	" 0 1.00 1.00 1.00 22\n",
	" 1 0.96 0.90 0.93 29\n",
	" 2 0.88 0.96 0.92 24\n",
	"\n",
	"avg / total 0.95 0.95 0.95 75\n",
	"\n"
	]
	}
	],
	"source": [
	"# Let sklearn summarize per-class precision, recall and F1 for the re-weighted model\n",
	"print metrics.classification_report(y_test, y_pred)"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python [Root]",
	"language": "python",
	"name": "Python [Root]"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.12"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}