Skip to content

Instantly share code, notes, and snippets.

@georgehc
Created November 20, 2019 02:36
Show Gist options
  • Save georgehc/7eff5acc0c5f8de144bb0d3b049ea781 to your computer and use it in GitHub Desktop.
Save georgehc/7eff5acc0c5f8de144bb0d3b049ea781 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 94-775/95-865: Prediction and Model Validation\n",
"\n",
"Author: George H. Chen (georgechen [at symbol] cmu.edu)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"\n",
"# the lines below are just for aesthetics\n",
"plt.style.use('ggplot') # if you want your plots to look like ggplot (like how R makes plots)\n",
"%config InlineBackend.figure_format = 'retina' # if you use a Mac with Retina display"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data preparation"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using TensorFlow backend.\n"
]
}
],
"source": [
"# MNIST is loaded through the standalone `keras` package (it reports its TensorFlow backend on import)\n",
"from keras.datasets import mnist\n",
"\n",
"(train_images, train_labels), (test_images, test_labels) = mnist.load_data()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"train_images = train_images[:2000]\n",
"train_labels = train_labels[:2000]\n",
"test_images = test_images[:500]\n",
"test_labels = test_labels[:500]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"flattened_train_images = train_images.reshape(len(train_images), -1) # flattens out each training image\n",
"flattened_test_images = test_images.reshape(len(test_images), -1) # flattens out each test image"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"flattened_train_images = flattened_train_images.astype(np.float32) / 255 # rescale to be between 0 and 1\n",
"flattened_test_images = flattened_test_images.astype(np.float32) / 255 # rescale to be between 0 and 1"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(2000, 784)\n"
]
}
],
"source": [
"print(flattened_train_images.shape)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(500, 784)\n"
]
}
],
"source": [
"print(flattened_test_images.shape)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.image.AxesImage at 0x11a8dd150>"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"image/png": {
"height": 248,
"width": 251
},
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# we could plot out what training images look like\n",
"plt.imshow(train_images[1].reshape(28, 28), cmap='gray')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Classification using $k$-nearest neighbors"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n",
" metric_params=None, n_jobs=None, n_neighbors=1, p=2,\n",
" weights='uniform')"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.neighbors import KNeighborsClassifier\n",
"classifier = KNeighborsClassifier(n_neighbors=1)\n",
"classifier.fit(flattened_train_images, train_labels)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"predicted_train_labels = classifier.predict(flattened_train_images)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([5, 0, 4, ..., 5, 2, 0], dtype=uint8)"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predicted_train_labels"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.0\n"
]
}
],
"source": [
"error_rate = np.mean(predicted_train_labels != train_labels)\n",
"print(error_rate)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Choosing hyperparameter $k$ using simple data splitting"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"np.random.seed(0)\n",
"num_train_images = len(flattened_train_images)\n",
"shuffled_indices = np.random.permutation(num_train_images)\n",
"\n",
"train_frac = 0.7\n",
"num_actual_training_examples = int(train_frac*num_train_images)\n",
"smaller_train_indices = shuffled_indices[:num_actual_training_examples]\n",
"validation_indices = shuffled_indices[num_actual_training_examples:]"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"k: 1 error rate: 0.08666666666666667\n",
"k: 2 error rate: 0.105\n",
"k: 3 error rate: 0.085\n",
"k: 4 error rate: 0.095\n",
"k: 5 error rate: 0.08666666666666667\n",
"k: 6 error rate: 0.09666666666666666\n",
"k: 7 error rate: 0.095\n",
"k: 8 error rate: 0.09833333333333333\n",
"k: 9 error rate: 0.1\n",
"k: 10 error rate: 0.105\n",
"k: 11 error rate: 0.10666666666666667\n",
"k: 12 error rate: 0.10333333333333333\n",
"k: 13 error rate: 0.115\n",
"k: 14 error rate: 0.11166666666666666\n",
"k: 15 error rate: 0.10833333333333334\n",
"k: 16 error rate: 0.11\n",
"k: 17 error rate: 0.10833333333333334\n",
"k: 18 error rate: 0.115\n",
"k: 19 error rate: 0.11333333333333333\n",
"k: 20 error rate: 0.11666666666666667\n",
"k: 21 error rate: 0.11333333333333333\n",
"k: 22 error rate: 0.11833333333333333\n",
"k: 23 error rate: 0.12\n",
"k: 24 error rate: 0.12\n",
"k: 25 error rate: 0.12166666666666667\n",
"k: 26 error rate: 0.12\n",
"k: 27 error rate: 0.12\n",
"k: 28 error rate: 0.12166666666666667\n",
"k: 29 error rate: 0.12666666666666668\n",
"k: 30 error rate: 0.12666666666666668\n",
"k: 31 error rate: 0.12666666666666668\n",
"k: 32 error rate: 0.13\n",
"k: 33 error rate: 0.13\n",
"k: 34 error rate: 0.12666666666666668\n",
"k: 35 error rate: 0.13\n",
"k: 36 error rate: 0.13166666666666665\n",
"k: 37 error rate: 0.13\n",
"k: 38 error rate: 0.13166666666666665\n",
"k: 39 error rate: 0.12666666666666668\n",
"k: 40 error rate: 0.13333333333333333\n",
"k: 41 error rate: 0.13666666666666666\n",
"k: 42 error rate: 0.14\n",
"k: 43 error rate: 0.14166666666666666\n",
"k: 44 error rate: 0.14333333333333334\n",
"k: 45 error rate: 0.145\n",
"k: 46 error rate: 0.14666666666666667\n",
"k: 47 error rate: 0.14166666666666666\n",
"k: 48 error rate: 0.14166666666666666\n",
"k: 49 error rate: 0.14666666666666667\n",
"k: 50 error rate: 0.14333333333333334\n",
"Best k: 3 error rate: 0.085\n"
]
}
],
"source": [
"# Hold-out validation: fit on the smaller training split and score each k on the validation split.\n",
"# The slices do not depend on k, so build them once outside the loop\n",
"# (indexing with an integer array makes a copy every time it is evaluated).\n",
"holdout_train_images = flattened_train_images[smaller_train_indices]\n",
"holdout_train_labels = train_labels[smaller_train_indices]\n",
"holdout_val_images = flattened_train_images[validation_indices]\n",
"holdout_val_labels = train_labels[validation_indices]\n",
"\n",
"lowest_error = np.inf\n",
"best_k = None\n",
"for k in range(1, 51):\n",
"    classifier = KNeighborsClassifier(n_neighbors=k)\n",
"    classifier.fit(holdout_train_images, holdout_train_labels)\n",
"    predicted_val_labels = classifier.predict(holdout_val_images)\n",
"    error = np.mean(predicted_val_labels != holdout_val_labels)\n",
"    print('k:', k, 'error rate:', error)\n",
"\n",
"    # strict '<' keeps the smallest k when several values of k tie for the lowest error\n",
"    if error < lowest_error:\n",
"        lowest_error = error\n",
"        best_k = k\n",
"\n",
"print('Best k:', best_k, 'error rate:', lowest_error)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Choosing hyperparameter $k$ using 5-fold cross validation"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"k: 1 cross validation error: 0.092\n",
"k: 2 cross validation error: 0.10899999999999999\n",
"k: 3 cross validation error: 0.10099999999999998\n",
"k: 4 cross validation error: 0.10400000000000001\n",
"k: 5 cross validation error: 0.1025\n",
"k: 6 cross validation error: 0.10850000000000001\n",
"k: 7 cross validation error: 0.10500000000000001\n",
"k: 8 cross validation error: 0.1115\n",
"k: 9 cross validation error: 0.11199999999999999\n",
"k: 10 cross validation error: 0.1155\n",
"k: 11 cross validation error: 0.11750000000000001\n",
"k: 12 cross validation error: 0.11950000000000001\n",
"k: 13 cross validation error: 0.12300000000000003\n",
"k: 14 cross validation error: 0.12400000000000003\n",
"k: 15 cross validation error: 0.131\n",
"k: 16 cross validation error: 0.1335\n",
"k: 17 cross validation error: 0.133\n",
"k: 18 cross validation error: 0.13649999999999998\n",
"k: 19 cross validation error: 0.13899999999999998\n",
"k: 20 cross validation error: 0.1405\n",
"k: 21 cross validation error: 0.14350000000000002\n",
"k: 22 cross validation error: 0.14200000000000002\n",
"k: 23 cross validation error: 0.142\n",
"k: 24 cross validation error: 0.14100000000000001\n",
"k: 25 cross validation error: 0.14300000000000002\n",
"k: 26 cross validation error: 0.146\n",
"k: 27 cross validation error: 0.14850000000000002\n",
"k: 28 cross validation error: 0.14950000000000002\n",
"k: 29 cross validation error: 0.152\n",
"k: 30 cross validation error: 0.1545\n",
"k: 31 cross validation error: 0.15700000000000003\n",
"k: 32 cross validation error: 0.1595\n",
"k: 33 cross validation error: 0.159\n",
"k: 34 cross validation error: 0.15750000000000003\n",
"k: 35 cross validation error: 0.1605\n",
"k: 36 cross validation error: 0.1625\n",
"k: 37 cross validation error: 0.16399999999999998\n",
"k: 38 cross validation error: 0.1645\n",
"k: 39 cross validation error: 0.16450000000000004\n",
"k: 40 cross validation error: 0.1665\n",
"k: 41 cross validation error: 0.16999999999999998\n",
"k: 42 cross validation error: 0.1745\n",
"k: 43 cross validation error: 0.17650000000000002\n",
"k: 44 cross validation error: 0.175\n",
"k: 45 cross validation error: 0.17850000000000002\n",
"k: 46 cross validation error: 0.18\n",
"k: 47 cross validation error: 0.1805\n",
"k: 48 cross validation error: 0.1815\n",
"k: 49 cross validation error: 0.1825\n",
"k: 50 cross validation error: 0.1825\n",
"Best k: 1 cross validation error: 0.092\n"
]
}
],
"source": [
"from sklearn.model_selection import KFold\n",
"\n",
"# 5-fold cross validation over k: average the validation error across folds\n",
"# and keep the k with the lowest average.\n",
"lowest_cross_val_error = np.inf\n",
"best_k = None\n",
"\n",
"indices = range(num_train_images)\n",
"kf = KFold(n_splits=5, shuffle=True, random_state=0)\n",
"# random_state is a fixed int, so every kf.split() call produces the same folds;\n",
"# generate them once here instead of once per value of k\n",
"folds = list(kf.split(indices))\n",
"for k in range(1, 51):\n",
"    errors = []\n",
"    for train_indices, val_indices in folds:\n",
"        classifier = KNeighborsClassifier(n_neighbors=k)\n",
"        classifier.fit(flattened_train_images[train_indices],\n",
"                       train_labels[train_indices])\n",
"        predicted_val_labels = classifier.predict(flattened_train_images[val_indices])\n",
"        error = np.mean(predicted_val_labels != train_labels[val_indices])\n",
"        errors.append(error)\n",
"\n",
"    cross_val_error = np.mean(errors)\n",
"    print('k:', k, 'cross validation error:', cross_val_error)\n",
"\n",
"    # strict '<' keeps the smallest k when several values of k tie\n",
"    if cross_val_error < lowest_cross_val_error:\n",
"        lowest_cross_val_error = cross_val_error\n",
"        best_k = k\n",
"\n",
"print('Best k:', best_k, 'cross validation error:', lowest_cross_val_error)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Using different classifiers"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"It's simple to work with other classifiers in scikit-learn. For example, here is how one can use a random forest classifier, treating the number of trees and the max depth as hyperparameters. (There are other hyperparameters as well, but we're just using the scikit-learn default values in this demo. If you care about carefully tuning the performance of a random forest classifier, then you should look into what the other hyperparameters do by reading the documentation.)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.0\n"
]
}
],
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"rf_classifier = RandomForestClassifier(n_estimators=50, max_depth=None, random_state=0)\n",
"rf_classifier.fit(flattened_train_images, train_labels)\n",
"rf_predicted_train_labels = rf_classifier.predict(flattened_train_images)\n",
"rf_error = np.mean(rf_predicted_train_labels != train_labels)\n",
"print(rf_error) # training set error"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Next, we run cross-validation for random forests. Importantly, we now sweep over two hyperparameters at once."
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Hyperparameter: (50, 3) cross validation error: 0.2765\n",
"Hyperparameter: (50, 4) cross validation error: 0.20350000000000001\n",
"Hyperparameter: (50, 5) cross validation error: 0.155\n",
"Hyperparameter: (50, None) cross validation error: 0.0905\n",
"Hyperparameter: (100, 3) cross validation error: 0.2635\n",
"Hyperparameter: (100, 4) cross validation error: 0.19899999999999998\n",
"Hyperparameter: (100, 5) cross validation error: 0.1545\n",
"Hyperparameter: (100, None) cross validation error: 0.08549999999999999\n",
"Hyperparameter: (150, 3) cross validation error: 0.263\n",
"Hyperparameter: (150, 4) cross validation error: 0.1955\n",
"Hyperparameter: (150, 5) cross validation error: 0.15\n",
"Hyperparameter: (150, None) cross validation error: 0.08149999999999999\n",
"Hyperparameter: (200, 3) cross validation error: 0.2625\n",
"Hyperparameter: (200, 4) cross validation error: 0.1975\n",
"Hyperparameter: (200, 5) cross validation error: 0.149\n",
"Hyperparameter: (200, None) cross validation error: 0.0825\n",
"Best hyperparameter: (150, None) cross validation error: 0.08149999999999999\n"
]
}
],
"source": [
"# 5-fold cross validation over every (number of trees, max depth) pair:\n",
"# average the validation error across folds and keep the pair with the lowest average.\n",
"lowest_cross_val_error = np.inf\n",
"best_hyperparameter_setting = None\n",
"\n",
"hyperparameter_settings = [(num_trees, max_depth)\n",
"                           for num_trees in [50, 100, 150, 200]\n",
"                           for max_depth in [3, 4, 5, None]]\n",
"\n",
"indices = range(num_train_images)\n",
"kf = KFold(n_splits=5, shuffle=True, random_state=0)\n",
"# random_state is a fixed int, so every kf.split() call produces the same folds;\n",
"# generate them once here instead of once per hyperparameter setting\n",
"folds = list(kf.split(indices))\n",
"for hyperparameter_setting in hyperparameter_settings:\n",
"    num_trees, max_depth = hyperparameter_setting\n",
"    errors = []\n",
"    for train_indices, val_indices in folds:\n",
"        classifier = RandomForestClassifier(n_estimators=num_trees,\n",
"                                            max_depth=max_depth,\n",
"                                            random_state=0)\n",
"        classifier.fit(flattened_train_images[train_indices],\n",
"                       train_labels[train_indices])\n",
"        predicted_val_labels = classifier.predict(flattened_train_images[val_indices])\n",
"        error = np.mean(predicted_val_labels != train_labels[val_indices])\n",
"        errors.append(error)\n",
"\n",
"    cross_val_error = np.mean(errors)\n",
"    print('Hyperparameter:', hyperparameter_setting, 'cross validation error:', cross_val_error)\n",
"\n",
"    # strict '<' keeps the earliest setting in the sweep when several settings tie\n",
"    if cross_val_error < lowest_cross_val_error:\n",
"        lowest_cross_val_error = cross_val_error\n",
"        best_hyperparameter_setting = hyperparameter_setting\n",
"\n",
"print('Best hyperparameter:', best_hyperparameter_setting, 'cross validation error:', lowest_cross_val_error)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Finally actually looking at the test data"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.138\n"
]
}
],
"source": [
"final_knn_classifier = KNeighborsClassifier(n_neighbors=best_k)\n",
"final_knn_classifier.fit(flattened_train_images, train_labels)\n",
"predicted_test_labels = final_knn_classifier.predict(flattened_test_images)\n",
"test_set_error = np.mean(predicted_test_labels != test_labels)\n",
"print(test_set_error)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.082\n"
]
}
],
"source": [
"final_rf_classifier = RandomForestClassifier(n_estimators=best_hyperparameter_setting[0],\n",
" max_depth=best_hyperparameter_setting[1],\n",
" random_state=0)\n",
"final_rf_classifier.fit(flattened_train_images, train_labels)\n",
"predicted_test_labels = final_rf_classifier.predict(flattened_test_images)\n",
"test_set_error = np.mean(predicted_test_labels != test_labels)\n",
"print(test_set_error)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note that in general the cross validation error is not going to perfectly match up with the test set error."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment