{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Python and data setup"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"from matplotlib import rcParams\n",
"from skimage import io\n",
"import sklearn\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"rcParams['font.size'] = 14\n",
"rcParams['lines.linewidth'] = 2\n",
"rcParams['figure.figsize'] = (10, 6)\n",
"rcParams['axes.titlepad'] = 20"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"PLANET_KAGGLE_ROOT = os.path.abspath(\"input/\")\n",
"PLANET_KAGGLE_JPEG_DIR = os.path.join(PLANET_KAGGLE_ROOT, 'train-jpg')\n",
"PLANET_KAGGLE_LABEL_CSV = os.path.join(PLANET_KAGGLE_ROOT, 'train_v2.csv')\n",
"assert os.path.exists(PLANET_KAGGLE_ROOT)\n",
"assert os.path.exists(PLANET_KAGGLE_JPEG_DIR)\n",
"assert os.path.exists(PLANET_KAGGLE_LABEL_CSV)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"labels_df = pd.read_csv(PLANET_KAGGLE_LABEL_CSV)\n",
"# Build list with unique labels\n",
"label_list = set()\n",
"for tag_str in labels_df.tags.values:\n",
" labels = tag_str.split()\n",
" for label in labels:\n",
" label_list.add(label)\n",
"# Add one hot features (new columns in the dataframe) for every label\n",
"for label in label_list:\n",
" labels_df[label] = labels_df['tags'].apply(lambda x: 1 if label in x.split(' ') else 0)"
]
},
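{
"cell_type": "markdown",
"metadata": {},
"source": [
"As an aside, the same one-hot columns could be built more concisely with scikit-learn's `MultiLabelBinarizer`. This is just a sketch of an alternative (not used below); `mlb` and `tag_matrix` are illustrative names, not part of the notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Alternative sketch (not executed): build the same one-hot label matrix with MultiLabelBinarizer\n",
"from sklearn.preprocessing import MultiLabelBinarizer\n",
"\n",
"mlb = MultiLabelBinarizer()\n",
"tag_matrix = mlb.fit_transform(labels_df['tags'].str.split())  # rows align with labels_df\n",
"# mlb.classes_ holds the label names in the column order of tag_matrix"
]
},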
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"label_list = list(label_list)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Split up the training data"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"num_exs = len(labels_df)\n",
"ntrain = int(num_exs * .6)\n",
"nval = int((num_exs-ntrain)/2)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"24287\n",
"8096\n"
]
}
],
"source": [
"print(ntrain)\n",
"print(nval)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# data is already shuffled by the contest organizers, no need to randomize\n",
"train_data = labels_df[:ntrain]\n",
"validation_data = labels_df[ntrain:ntrain+nval]\n",
"test_data = labels_df[ntrain+nval:]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Baseline model"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"valid_true = validation_data[label_list] # get the actual labels for the validation set\n",
"# set up an empty prediction vector, length the number of classes\n",
"preds = np.zeros(len(label_list))\n",
"preds = pd.DataFrame([preds]*len(validation_data),columns=label_list)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# just for fun, I'm also showing score for random vector, and a vector of all 1's\n",
"rand_preds = np.random.choice([0,1], size=(len(validation_data),len(label_list)))"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics import fbeta_score"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.33572727002015307"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"fbeta_score(valid_true, rand_preds, beta=2, average='samples')"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"one_preds = np.ones(len(label_list))\n",
"one_preds = pd.DataFrame([one_preds]*len(validation_data),columns=label_list)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.48279449258650137"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"fbeta_score(valid_true, one_preds, beta=2, average='samples')"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"valid_true = validation_data[label_list] # get the actual labels for the validation set\n",
"# set up an empty prediction vector, length the number of classes\n",
"preds = np.zeros(len(label_list))\n",
"# set the primary label to positive\n",
"preds[label_list.index('primary')] = 1"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# now populate a prediction matrix, every example just with the primary label\n",
"valid_baseline_pred = pd.DataFrame([preds]*len(validation_data),columns=label_list)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"# we ought to do better saying clear & primary:\n",
"preds[label_list.index('clear')] = 1\n",
"valid_baseline_pred_2 = pd.DataFrame([preds]*len(validation_data),columns=label_list)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Evaluation Metric"
]
},
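{
"cell_type": "markdown",
"metadata": {},
"source": [
"The competition metric is the F-beta score with beta = 2, averaged over samples, which weights recall more heavily than precision:\n",
"\n",
"$$F_\\beta = (1 + \\beta^2)\\,\\frac{P \\cdot R}{\\beta^2 P + R}$$\n",
"\n",
"With `average='samples'`, sklearn computes this per image (comparing the predicted tag vector to the true one) and then averages across images."
]
},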
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics import fbeta_score"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.4015255862310777"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"fbeta_score(valid_true, valid_baseline_pred, beta=2, average='samples')"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.6433861442943634"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"fbeta_score(valid_true, valid_baseline_pred_2, beta=2, average='samples')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### A Smarter Model"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.multiclass import OneVsRestClassifier\n",
"from sklearn.linear_model import LogisticRegression\n",
"import cv2\n",
"\n",
"rescaled_dim = 32"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Started from https://www.kaggle.com/syedosman/logistic-regression-classification"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The below code is a bit complex! It reads the jpg training images, rescales them down, and reshapes each to a flat vector for input to Sklearn, which requires numpy arrays!"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"X_train = np.squeeze(np.array([cv2.resize(io.imread(os.path.join(PLANET_KAGGLE_ROOT, 'train-jpg', name+'.jpg')),\n",
" (rescaled_dim, rescaled_dim), cv2.INTER_LINEAR).reshape(1, -1)\n",
" for name in train_data['image_name'].values]))"
]
},
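{
"cell_type": "markdown",
"metadata": {},
"source": [
"For readability, here is the same operation written as a small helper function. This is only a sketch; `load_and_flatten` is an illustrative name and is not used elsewhere in the notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch (not executed): the same load / resize / flatten pipeline, step by step\n",
"def load_and_flatten(name, dim=rescaled_dim):\n",
"    path = os.path.join(PLANET_KAGGLE_JPEG_DIR, name + '.jpg')\n",
"    img = io.imread(path)                                   # (256, 256, 3) uint8 array for these images\n",
"    small = cv2.resize(img, (dim, dim), interpolation=cv2.INTER_LINEAR)\n",
"    return small.reshape(-1)                                # flatten to a (dim*dim*3,) feature vector\n",
"\n",
"# X_train_alt = np.array([load_and_flatten(n) for n in train_data['image_name'].values])"
]
},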
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>conventional_mine</th>\n",
" <th>primary</th>\n",
" <th>habitation</th>\n",
" <th>water</th>\n",
" <th>road</th>\n",
" <th>blooming</th>\n",
" <th>blow_down</th>\n",
" <th>partly_cloudy</th>\n",
" <th>selective_logging</th>\n",
" <th>haze</th>\n",
" <th>clear</th>\n",
" <th>slash_burn</th>\n",
" <th>bare_ground</th>\n",
" <th>artisinal_mine</th>\n",
" <th>cultivation</th>\n",
" <th>agriculture</th>\n",
" <th>cloudy</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" conventional_mine primary habitation water road blooming blow_down \\\n",
"0 0 1 0 0 0 0 0 \n",
"1 0 1 0 1 0 0 0 \n",
"2 0 1 0 0 0 0 0 \n",
"3 0 1 0 0 0 0 0 \n",
"4 0 1 1 0 1 0 0 \n",
"\n",
" partly_cloudy selective_logging haze clear slash_burn bare_ground \\\n",
"0 0 0 1 0 0 0 \n",
"1 0 0 0 1 0 0 \n",
"2 0 0 0 1 0 0 \n",
"3 0 0 0 1 0 0 \n",
"4 0 0 0 1 0 0 \n",
"\n",
" artisinal_mine cultivation agriculture cloudy \n",
"0 0 0 0 0 \n",
"1 0 0 1 0 \n",
"2 0 0 0 0 \n",
"3 0 0 0 0 \n",
"4 0 0 1 0 "
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# now get the correct labels for the training data\n",
"y_train = train_data[label_list]\n",
"y_train.head()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"# and the validation set\n",
"X_valid = np.squeeze(np.array([cv2.resize(io.imread(os.path.join(PLANET_KAGGLE_ROOT, 'train-jpg', name+'.jpg')),\n",
" (rescaled_dim, rescaled_dim), cv2.INTER_LINEAR).reshape(1, -1)\n",
" for name in validation_data['image_name'].values]))"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"y_valid = validation_data[label_list]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Recall the discussion of normalization in the data preparation post"
]
},
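{
"cell_type": "markdown",
"metadata": {},
"source": [
"Concretely, `sklearn.preprocessing.scale` standardizes each pixel column independently:\n",
"\n",
"$$x'_{ij} = \\frac{x_{ij} - \\mu_j}{\\sigma_j}$$\n",
"\n",
"where $\\mu_j$ and $\\sigma_j$ are the mean and standard deviation of feature (pixel position and channel) $j$. The train and validation rows are concatenated first so both are scaled with the same statistics."
]
},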
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"allXs = np.concatenate((X_train, X_valid))"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/cynthiathompson/miniconda3/lib/python3.7/site-packages/sklearn/utils/validation.py:595: DataConversionWarning: Data with input dtype uint8 was converted to float64 by the scale function.\n",
" warnings.warn(msg, DataConversionWarning)\n"
]
}
],
"source": [
"allXs = sklearn.preprocessing.scale(allXs)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"24287 8096\n"
]
}
],
"source": [
"X_train = allXs[:ntrain]\n",
"X_valid = allXs[ntrain:]\n",
"print(len(X_train), len(X_valid))"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.multiclass import OneVsRestClassifier\n",
"from sklearn.linear_model import LogisticRegression"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
" intercept_scaling=1, max_iter=100, multi_class='warn',\n",
" n_jobs=None, penalty='l2', random_state=None, solver='liblinear',\n",
" tol=0.0001, verbose=0, warm_start=False),\n",
" n_jobs=None)"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clf = OneVsRestClassifier(LogisticRegression(solver='liblinear'))\n",
"clf.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/cynthiathompson/miniconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1143: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in samples with no predicted labels.\n",
" 'precision', 'predicted', average, warn_for)\n"
]
},
{
"data": {
"text/plain": [
"0.6755013691546999"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pred_test = clf.predict(X_valid)\n",
"fbeta_score(y_valid, pred_test, beta=2, average='samples')"
]
},
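{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a possible follow-up (a sketch, not run here), `fbeta_score` with `average=None` returns one score per label, which shows which tags the classifier handles well and which drag the sample-averaged score down."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch (not executed): per-label F2 scores on the validation set\n",
"per_label_f2 = fbeta_score(y_valid, pred_test, beta=2, average=None)\n",
"pd.Series(per_label_f2, index=label_list).sort_values(ascending=False)"
]
},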
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}