{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Python and data setup"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"from matplotlib import rcParams\n",
"from skimage import io\n",
"import sklearn\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"rcParams['font.size'] = 14\n",
"rcParams['lines.linewidth'] = 2\n",
"rcParams['figure.figsize'] = (10, 6)\n",
"rcParams['axes.titlepad'] = 20"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"PLANET_KAGGLE_ROOT = os.path.abspath(\"input/\")\n",
"PLANET_KAGGLE_JPEG_DIR = os.path.join(PLANET_KAGGLE_ROOT, 'train-jpg')\n",
"PLANET_KAGGLE_LABEL_CSV = os.path.join(PLANET_KAGGLE_ROOT, 'train_v2.csv')\n",
"assert os.path.exists(PLANET_KAGGLE_ROOT)\n",
"assert os.path.exists(PLANET_KAGGLE_JPEG_DIR)\n",
"assert os.path.exists(PLANET_KAGGLE_LABEL_CSV)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"labels_df = pd.read_csv(PLANET_KAGGLE_LABEL_CSV)\n",
"# Build list with unique labels\n",
"label_list = set()\n",
"for tag_str in labels_df.tags.values:\n",
" labels = tag_str.split()\n",
" for label in labels:\n",
" label_list.add(label)\n",
"# Add one hot features (new columns in the dataframe) for every label\n",
"for label in label_list:\n",
" labels_df[label] = labels_df['tags'].apply(lambda x: 1 if label in x.split(' ') else 0)"
]
},
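{
"cell_type": "markdown",
"metadata": {},
"source": [
"As an aside, the same one-hot columns could be built more concisely with scikit-learn's `MultiLabelBinarizer`. This is just a sketch of an alternative (not used below); `mlb` and `tag_matrix` are illustrative names, not part of the notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Alternative sketch (not executed): build the same one-hot label matrix with MultiLabelBinarizer\n",
"from sklearn.preprocessing import MultiLabelBinarizer\n",
"\n",
"mlb = MultiLabelBinarizer()\n",
"tag_matrix = mlb.fit_transform(labels_df['tags'].str.split())  # rows align with labels_df\n",
"# mlb.classes_ holds the label names in the column order of tag_matrix"
]
},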
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"label_list = list(label_list)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Split up the training data"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"num_exs = len(labels_df)\n",
"ntrain = int(num_exs * .6)\n",
"nval = int((num_exs-ntrain)/2)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"24287\n",
"8096\n"
]
}
],
"source": [
"print(ntrain)\n",
"print(nval)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# data is already shuffled by the contest organizers, no need to randomize\n",
"train_data = labels_df[:ntrain]\n",
"validation_data = labels_df[ntrain:ntrain+nval]\n",
"test_data = labels_df[ntrain+nval:]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Baseline model"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"valid_true = validation_data[label_list] # get the actual labels for the validation set\n",
"# set up an empty prediction vector, length the number of classes\n",
"preds = np.zeros(len(label_list))\n",
"preds = pd.DataFrame([preds]*len(validation_data),columns=label_list)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# just for fun, I'm also showing score for random vector, and a vector of all 1's\n",
"rand_preds = np.random.choice([0,1], size=(len(validation_data),len(label_list)))"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics import fbeta_score"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.33572727002015307"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"fbeta_score(valid_true, rand_preds, beta=2, average='samples')"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"one_preds = np.ones(len(label_list))\n",
"one_preds = pd.DataFrame([one_preds]*len(validation_data),columns=label_list)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.48279449258650137"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"fbeta_score(valid_true, one_preds, beta=2, average='samples')"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"valid_true = validation_data[label_list] # get the actual labels for the validation set\n",
"# set up an empty prediction vector, length the number of classes\n",
"preds = np.zeros(len(label_list))\n",
"# set the primary label to positive\n",
"preds[label_list.index('primary')] = 1"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# now populate a prediction matrix, every example just with the primary label\n",
"valid_baseline_pred = pd.DataFrame([preds]*len(validation_data),columns=label_list)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"# we ought to do better saying clear & primary:\n",
"preds[label_list.index('clear')] = 1\n",
"valid_baseline_pred_2 = pd.DataFrame([preds]*len(validation_data),columns=label_list)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Evaluation Metric"
]
},
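{
"cell_type": "markdown",
"metadata": {},
"source": [
"The competition metric is the F-beta score with beta = 2, averaged over samples, which weights recall more heavily than precision:\n",
"\n",
"$$F_\\beta = (1 + \\beta^2)\\,\\frac{P \\cdot R}{\\beta^2 P + R}$$\n",
"\n",
"With `average='samples'`, sklearn computes this per image (comparing the predicted tag vector to the true one) and then averages across images."
]
},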
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics import fbeta_score"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.4015255862310777"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"fbeta_score(valid_true, valid_baseline_pred, beta=2, average='samples')"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.6433861442943634"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"fbeta_score(valid_true, valid_baseline_pred_2, beta=2, average='samples')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### A Smarter Model"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.multiclass import OneVsRestClassifier\n",
"from sklearn.linear_model import LogisticRegression\n",
"import cv2\n",
"\n",
"rescaled_dim = 32"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Started from https://www.kaggle.com/syedosman/logistic-regression-classification"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The below code is a bit complex! It reads the jpg training images, rescales them down, and reshapes each to a flat vector for input to Sklearn, which requires numpy arrays!"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"X_train = np.squeeze(np.array([cv2.resize(io.imread(os.path.join(PLANET_KAGGLE_ROOT, 'train-jpg', name+'.jpg')),\n",
" (rescaled_dim, rescaled_dim), cv2.INTER_LINEAR).reshape(1, -1)\n",
" for name in train_data['image_name'].values]))"
]
},
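{
"cell_type": "markdown",
"metadata": {},
"source": [
"For readability, here is the same operation written as a small helper function. This is only a sketch; `load_and_flatten` is an illustrative name and is not used elsewhere in the notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch (not executed): the same load / resize / flatten pipeline, step by step\n",
"def load_and_flatten(name, dim=rescaled_dim):\n",
"    path = os.path.join(PLANET_KAGGLE_JPEG_DIR, name + '.jpg')\n",
"    img = io.imread(path)                                   # (256, 256, 3) uint8 array for these images\n",
"    small = cv2.resize(img, (dim, dim), interpolation=cv2.INTER_LINEAR)\n",
"    return small.reshape(-1)                                # flatten to a (dim*dim*3,) feature vector\n",
"\n",
"# X_train_alt = np.array([load_and_flatten(n) for n in train_data['image_name'].values])"
]
},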
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>conventional_mine</th>\n",
" <th>primary</th>\n",
" <th>habitation</th>\n",
" <th>water</th>\n",
" <th>road</th>\n",
" <th>blooming</th>\n",
" <th>blow_down</th>\n",
" <th>partly_cloudy</th>\n",
" <th>selective_logging</th>\n",
" <th>haze</th>\n",
" <th>clear</th>\n",
" <th>slash_burn</th>\n",
" <th>bare_ground</th>\n",
" <th>artisinal_mine</th>\n",
" <th>cultivation</th>\n",
" <th>agriculture</th>\n",
" <th>cloudy</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" conventional_mine primary habitation water road blooming blow_down \\\n",
"0 0 1 0 0 0 0 0 \n",
"1 0 1 0 1 0 0 0 \n",
"2 0 1 0 0 0 0 0 \n",
"3 0 1 0 0 0 0 0 \n",
"4 0 1 1 0 1 0 0 \n",
"\n",
" partly_cloudy selective_logging haze clear slash_burn bare_ground \\\n",
"0 0 0 1 0 0 0 \n",
"1 0 0 0 1 0 0 \n",
"2 0 0 0 1 0 0 \n",
"3 0 0 0 1 0 0 \n",
"4 0 0 0 1 0 0 \n",
"\n",
" artisinal_mine cultivation agriculture cloudy \n",
"0 0 0 0 0 \n",
"1 0 0 1 0 \n",
"2 0 0 0 0 \n",
"3 0 0 0 0 \n",
"4 0 0 1 0 "
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# now get the correct labels for the training data\n",
"y_train = train_data[label_list]\n",
"y_train.head()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"# and the validation set\n",
"X_valid = np.squeeze(np.array([cv2.resize(io.imread(os.path.join(PLANET_KAGGLE_ROOT, 'train-jpg', name+'.jpg')),\n",
" (rescaled_dim, rescaled_dim), cv2.INTER_LINEAR).reshape(1, -1)\n",
" for name in validation_data['image_name'].values]))"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"y_valid = validation_data[label_list]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Recall the discussion of normalization in the data preparation post"
]
},
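{
"cell_type": "markdown",
"metadata": {},
"source": [
"Concretely, `sklearn.preprocessing.scale` standardizes each pixel column independently:\n",
"\n",
"$$x'_{ij} = \\frac{x_{ij} - \\mu_j}{\\sigma_j}$$\n",
"\n",
"where $\\mu_j$ and $\\sigma_j$ are the mean and standard deviation of feature (pixel position and channel) $j$. The train and validation rows are concatenated first so both are scaled with the same statistics."
]
},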
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"allXs = np.concatenate((X_train, X_valid))"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/cynthiathompson/miniconda3/lib/python3.7/site-packages/sklearn/utils/validation.py:595: DataConversionWarning: Data with input dtype uint8 was converted to float64 by the scale function.\n",
" warnings.warn(msg, DataConversionWarning)\n"
]
}
],
"source": [
"allXs = sklearn.preprocessing.scale(allXs)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"24287 8096\n"
]
}
],
"source": [
"X_train = allXs[:ntrain]\n",
"X_valid = allXs[ntrain:]\n",
"print(len(X_train), len(X_valid))"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.multiclass import OneVsRestClassifier\n",
"from sklearn.linear_model import LogisticRegression"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
" intercept_scaling=1, max_iter=100, multi_class='warn',\n",
" n_jobs=None, penalty='l2', random_state=None, solver='liblinear',\n",
" tol=0.0001, verbose=0, warm_start=False),\n",
" n_jobs=None)"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clf = OneVsRestClassifier(LogisticRegression(solver='liblinear'))\n",
"clf.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/cynthiathompson/miniconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1143: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in samples with no predicted labels.\n",
" 'precision', 'predicted', average, warn_for)\n"
]
},
{
"data": {
"text/plain": [
"0.6755013691546999"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pred_test = clf.predict(X_valid)\n",
"fbeta_score(y_valid, pred_test, beta=2, average='samples')"
]
},
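{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a possible follow-up (a sketch, not run here), `fbeta_score` with `average=None` returns one score per label, which shows which tags the classifier handles well and which drag the sample-averaged score down."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch (not executed): per-label F2 scores on the validation set\n",
"per_label_f2 = fbeta_score(y_valid, pred_test, beta=2, average=None)\n",
"pd.Series(per_label_f2, index=label_list).sort_values(ascending=False)"
]
},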
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}