Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save aegorenkov/b865aa9e77110255c58c4720d56b6ba1 to your computer and use it in GitHub Desktop.
Save aegorenkov/b865aa9e77110255c58c4720d56b6ba1 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn import datasets, linear_model, metrics\n",
"import pandas as pd\n",
"\n",
"# sklearn.cross_validation was removed in scikit-learn 0.20; its replacement,\n",
"# sklearn.model_selection, also provides train_test_split. Bind it to the old\n",
"# name so the cells below run unchanged on both old and new releases.\n",
"try:\n",
"    from sklearn import model_selection as cross_validation\n",
"except ImportError:  # scikit-learn older than 0.18 has no model_selection\n",
"    from sklearn import cross_validation\n",
"\n",
"%matplotlib inline\n",
"# Load the pre-packaged iris data from sklearn and convert it to pandas objects\n",
"iris = datasets.load_iris()\n",
"\n",
"# Feature matrix as a DataFrame, one column per measurement\n",
"irisdf = pd.DataFrame(iris.data, columns=iris.feature_names)\n",
"\n",
"# Outcome (species code) as a one-column DataFrame\n",
"iris_outcome = pd.DataFrame(iris.target, columns=['species'])\n",
"\n",
"# Put the outcome next to the features so they can be explored together.\n",
"# Both frames carry the same default index, so a plain index join lines them up.\n",
"irisdf = irisdf.join(iris_outcome)\n",
"\n",
"# Replace sklearn's verbose feature names with short snake_case ones\n",
"irisdf.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Fit a one-vs-rest logistic regression that predicts species from the four measurements\n",
"features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']\n",
"\n",
"# The species codes in irisdf are expected to be 0, 1 and 2, so this mask should keep every row\n",
"keep_rows = irisdf['species'].isin([0, 1, 2])\n",
"X = irisdf.loc[keep_rows, features]\n",
"y = irisdf.loc[keep_rows, 'species']\n",
"\n",
"# Hold out half of the rows for testing; random_state pins the shuffle\n",
"X_train, X_test, y_train, y_test = cross_validation.train_test_split(\n",
"    X, y, test_size=.5, random_state=5)\n",
"\n",
"# All three classes carry the same weight in this first fit\n",
"logit = linear_model.LogisticRegression(multi_class='ovr', class_weight={0: 1, 1: 1, 2: 1})\n",
"logit = logit.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Score the held-out rows: per-class probabilities, then the predicted label for each row\n",
"y_pred_proba = logit.predict_proba(X_test)\n",
"y_pred = logit.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[22,  0,  0],\n",
"       [ 0, 21,  8],\n",
"       [ 0,  0, 24]])"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Confusion matrix over all three species: rows are the true labels, columns the predictions\n",
"metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[53,  0],\n",
"       [ 0, 22]])"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# One-vs-rest view of class 0: recode class 0 as 1 and classes 1 and 2 as 0\n",
"replacement = {0:1, 1:0, 2:0}\n",
"actual_binary = y_test.replace(replacement)\n",
"predicted_binary = pd.Series(y_pred).replace(replacement)\n",
"metrics.confusion_matrix(actual_binary, predicted_binary)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[46,  0],\n",
"       [ 8, 21]])"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# One-vs-rest view of class 1: class 1 stays 1, classes 0 and 2 are recoded as 0\n",
"replacement = {0:0, 1:1, 2:0}\n",
"actual_binary = y_test.replace(replacement)\n",
"predicted_binary = pd.Series(y_pred).replace(replacement)\n",
"metrics.confusion_matrix(actual_binary, predicted_binary)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[43,  8],\n",
"       [ 0, 24]])"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# One-vs-rest view of class 2: recode class 2 as 1 and classes 0 and 1 as 0\n",
"replacement = {0:0, 1:0, 2:1}\n",
"actual_binary = y_test.replace(replacement)\n",
"predicted_binary = pd.Series(y_pred).replace(replacement)\n",
"metrics.confusion_matrix(actual_binary, predicted_binary)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.893333333333\n"
]
}
],
"source": [
"# Share of test rows whose predicted species equals the true species.\n",
"# print(...) with parentheses runs on both Python 2 and Python 3.\n",
"print(metrics.accuracy_score(y_test, y_pred))"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"             precision    recall  f1-score   support\n",
"\n",
"          0       1.00      1.00      1.00        22\n",
"          1       1.00      0.72      0.84        29\n",
"          2       0.75      1.00      0.86        24\n",
"\n",
"avg / total       0.92      0.89      0.89        75\n",
"\n"
]
}
],
"source": [
"# Rather than reading the per-class matrices by hand, let sklearn summarize\n",
"# precision, recall, f1-score and support for every class.\n",
"# print(...) with parentheses runs on both Python 2 and Python 3.\n",
"print(metrics.classification_report(y_test, y_pred))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Precision on class 2 is weak (0.75) while its recall is maxed out (1.00): the model over-predicts class 2. Let's down-weight class 2 with `class_weight`.**"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Refit the one-vs-rest logistic regression on the same split, with class 2 down-weighted\n",
"features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']\n",
"\n",
"# The species codes in irisdf are expected to be 0, 1 and 2, so this mask should keep every row\n",
"keep_rows = irisdf['species'].isin([0, 1, 2])\n",
"X = irisdf.loc[keep_rows, features]\n",
"y = irisdf.loc[keep_rows, 'species']\n",
"\n",
"# Same test_size and random_state as the first fit\n",
"X_train, X_test, y_train, y_test = cross_validation.train_test_split(\n",
"    X, y, test_size=.5, random_state=5)\n",
"\n",
"# Class 2 gets weight 0.4 while classes 0 and 1 keep weight 1.\n",
"# NOTE(review): if 0.4 was picked by looking at test-set scores, the scores\n",
"# reported below are optimistic; a separate validation split would avoid that.\n",
"logit = linear_model.LogisticRegression(multi_class='ovr', class_weight={0: 1, 1: 1, 2: .4})\n",
"logit = logit.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Re-score the held-out rows with the refitted model: probabilities, then predicted labels\n",
"y_pred_proba = logit.predict_proba(X_test)\n",
"y_pred = logit.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.946666666667\n"
]
}
],
"source": [
"# Accuracy of the refitted model on the test rows.\n",
"# print(...) with parentheses runs on both Python 2 and Python 3.\n",
"print(metrics.accuracy_score(y_test, y_pred))"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"             precision    recall  f1-score   support\n",
"\n",
"          0       1.00      1.00      1.00        22\n",
"          1       0.96      0.90      0.93        29\n",
"          2       0.88      0.96      0.92        24\n",
"\n",
"avg / total       0.95      0.95      0.95        75\n",
"\n"
]
}
],
"source": [
"# Per-class precision, recall, f1-score and support for the refitted model.\n",
"# print(...) with parentheses runs on both Python 2 and Python 3.\n",
"print(metrics.classification_report(y_test, y_pred))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [Root]",
"language": "python",
"name": "Python [Root]"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment