Skip to content

Instantly share code, notes, and snippets.

@mikeliao97
Last active July 9, 2019 12:06
Show Gist options
  • Save mikeliao97/1a0b2bccb1b05ddff6a40f8ae3bd93bb to your computer and use it in GitHub Desktop.
Save mikeliao97/1a0b2bccb1b05ddff6a40f8ae3bd93bb to your computer and use it in GitHub Desktop.
knn
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 112,
"metadata": {},
"outputs": [],
"source": [
"import math\n",
"\n",
"import matplotlib\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"from scipy.stats import norm\n",
"from sklearn.neighbors import KNeighborsRegressor\n",
"\n",
"try:\n",
"    from sklearn.model_selection import train_test_split, KFold\n",
"except ImportError:  # scikit-learn < 0.18 only ships the old module\n",
"    from sklearn.cross_validation import train_test_split, KFold\n",
"\n",
"\n",
"%config InlineBackend.figure_format = 'retina' #set 'png' here when working on notebook\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"train = pd.read_csv(\"./data.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>latitude</th>\n",
" <th>longitude</th>\n",
" <th>close_price</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>88042.000000</td>\n",
" <td>88042.000000</td>\n",
" <td>8.804200e+04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>34.559745</td>\n",
" <td>-93.470194</td>\n",
" <td>2.663684e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>13.983900</td>\n",
" <td>31.194818</td>\n",
" <td>1.543381e+07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>-89.938989</td>\n",
" <td>-179.891580</td>\n",
" <td>-9.995300e+04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>36.200312</td>\n",
" <td>-98.481499</td>\n",
" <td>1.233064e+05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>36.385215</td>\n",
" <td>-98.165490</td>\n",
" <td>1.816143e+05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>36.544719</td>\n",
" <td>-97.967260</td>\n",
" <td>2.841568e+05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>89.984841</td>\n",
" <td>179.969440</td>\n",
" <td>1.009998e+08</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" latitude longitude close_price\n",
"count 88042.000000 88042.000000 8.804200e+04\n",
"mean 34.559745 -93.470194 2.663684e+06\n",
"std 13.983900 31.194818 1.543381e+07\n",
"min -89.938989 -179.891580 -9.995300e+04\n",
"25% 36.200312 -98.481499 1.233064e+05\n",
"50% 36.385215 -98.165490 1.816143e+05\n",
"75% 36.544719 -97.967260 2.841568e+05\n",
"max 89.984841 179.969440 1.009998e+08"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train.describe()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"latitude float64\n",
"longitude float64\n",
"close_date object\n",
"close_price float64\n",
"dtype: object"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train.dtypes"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data Exploration"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Convert `close_date` to datetime"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"train['close_date'] = pd.to_datetime(train['close_date'])"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" latitude longitude close_date close_price date_ord\n",
"0 1.501986 86.350685 2014-08-16 22:25:31.925431 1.302246e+06 228\n",
"1 36.367095 -98.664280 2014-08-05 06:34:00.165876 1.475045e+05 217\n",
"2 36.599284 -97.924700 2014-08-12 23:48:00.887510 1.374006e+05 224\n",
"4 36.647982 -97.866100 2014-08-09 04:00:40.358242 2.391053e+05 221\n",
"5 36.525885 -98.333570 2014-08-07 15:18:18.456538 2.708852e+05 219\n"
]
}
],
"source": [
"# date_ord = whole days between each sale and the earliest sale in the data.\n",
"# Computed with vectorised datetime arithmetic instead of a per-row apply().\n",
"close_day = train['close_date'].dt.normalize()  # drop the time-of-day part\n",
"mindate = train['close_date'].min().toordinal()  # kept: ordinal of the earliest sale day\n",
"train['date_ord'] = (close_day - close_day.min()).dt.days\n",
"print(train.head())"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>latitude</th>\n",
" <th>longitude</th>\n",
" <th>close_date</th>\n",
" <th>close_price</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1.501986</td>\n",
" <td>86.350685</td>\n",
" <td>2014-08-16 22:25:31.925431</td>\n",
" <td>1.302246e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>36.367095</td>\n",
" <td>-98.664280</td>\n",
" <td>2014-08-05 06:34:00.165876</td>\n",
" <td>1.475045e+05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>36.599284</td>\n",
" <td>-97.924700</td>\n",
" <td>2014-08-12 23:48:00.887510</td>\n",
" <td>1.374006e+05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>67.994791</td>\n",
" <td>64.688589</td>\n",
" <td>2014-08-17 05:27:01.404296</td>\n",
" <td>-1.411200e+04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>36.647982</td>\n",
" <td>-97.866100</td>\n",
" <td>2014-08-09 04:00:40.358242</td>\n",
" <td>2.391053e+05</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" latitude longitude close_date close_price\n",
"0 1.501986 86.350685 2014-08-16 22:25:31.925431 1.302246e+06\n",
"1 36.367095 -98.664280 2014-08-05 06:34:00.165876 1.475045e+05\n",
"2 36.599284 -97.924700 2014-08-12 23:48:00.887510 1.374006e+05\n",
"3 67.994791 64.688589 2014-08-17 05:27:01.404296 -1.411200e+04\n",
"4 36.647982 -97.866100 2014-08-09 04:00:40.358242 2.391053e+05"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train.head(5)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"train = train[train['close_price'] > 0] ##drop smaller than 0"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"85868"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(train)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
" mu = 2732369.60 and sigma = 15621763.96\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/anaconda3/envs/fastai-cpu/lib/python3.6/site-packages/matplotlib/axes/_axes.py:6448: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg.\n",
" warnings.warn(\"The 'normed' kwarg is deprecated, and has been \"\n"
]
},
{
"data": {
"text/plain": [
"Text(0.5,1,'close_price distribution')"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"image/png": {
"height": 277,
"width": 377
}
},
"output_type": "display_data"
}
],
"source": [
"# Histogram of close_price with a fitted normal curve overlaid.\n",
"ax = sns.distplot(train['close_price'], fit=norm)\n",
"\n",
"# Get the fitted parameters used by the function\n",
"(mu, sigma) = norm.fit(train['close_price'])\n",
"print('\\n mu = {:.2f} and sigma = {:.2f}\\n'.format(mu, sigma))\n",
"\n",
"# Raw string: \\mu and \\sigma are mathtext for matplotlib, not Python escapes.\n",
"ax.legend([r'Normal dist. ($\\mu=$ {:.2f} and $\\sigma=$ {:.2f} )'.format(mu, sigma)],\n",
"          loc='best')\n",
"ax.set_ylabel('Frequency')\n",
"ax.set_title('close_price distribution');"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Feature Normalization"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"import operator"
]
},
{
"cell_type": "code",
"execution_count": 205,
"metadata": {},
"outputs": [],
"source": [
"class KNNRegressor:\n",
" #KNN is a lazy algorithm that \n",
" '''\n",
" train: The Pandas Dataframe object passed to KNN\n",
" \n",
" k: The integer value for the number of neighbors needed to taken into\n",
" account\n",
" \n",
" weights: either Uniform or Distance. How much weight each neighbor\n",
" gives to the instance\n",
" \n",
" '''\n",
" def __init__(self, train, k, weights='distances'):\n",
" self.train = train\n",
" self.k = k\n",
" self.predictions = None\n",
" self.weights = weights\n",
" \n",
" \n",
" '''\n",
" Params:\n",
" test: A test dataframe\n",
" \n",
" return: a set of predictions based on the model\n",
" '''\n",
" def predict(self, test):\n",
" self.predictions = []\n",
" for x in range(len(test)):\n",
" x_i = test.iloc[x]\n",
" pred = self.predict_one(x_i);\n",
" self.predictions.append(pred)\n",
" return self.predictions\n",
" \n",
" '''\n",
" params:\n",
" y: A single row in the test dataframe\n",
" \n",
" return: prediction of one row \n",
" \n",
" '''\n",
" def predict_one(self, y):\n",
" neighbors = self.get_neighbors(y) #get the neighbors \n",
" preds = 0\n",
" if (self.weights == 'uniform'):\n",
" for x in range(len(neighbors)):\n",
" preds += neighbors[x][0][-1] #Add the neighbor's preidiction\n",
" return preds / self.k #return the mean prediction\n",
" \n",
" if (self.weights == 'distances'):\n",
" b_weights = self.get_weights(neighbors)\n",
" for x in range(len(neighbors)):\n",
" preds += b_weights[x] * neighbors[x][0][-1]\n",
" return preds \n",
" '''\n",
" parmas:\n",
" y: A single row in the test dataframe\n",
" \n",
" return: neighbors for the given row:\n",
" neighors is an arrays of tuples ((neighbor), distance)\n",
" '''\n",
" def get_neighbors(self, y):\n",
" distances = []\n",
" for x in range(len(self.train)): #calculate the disatnces for each self.train\n",
" #Disregard any time leakage data\n",
" x_i = train.iloc[x]\n",
" if (x_i['date_ord'] < y['date_ord']): #only add dates before the y date \n",
" dist = self.get_distance(train.iloc[x], y)\n",
" distances.append((x_i, dist))\n",
" distances.sort(key=operator.itemgetter(1))\n",
" \n",
" neighbors = []\n",
" for x in range(self.k):\n",
" if (len(distances) > x):\n",
" neighbors.append(distances[x]) \n",
" return neighbors\n",
" \n",
" #euclidean distance\n",
" def get_distance(self, x, y):\n",
" d1 = y['latitude'] - x['latitude']\n",
" d2 = y['longitude'] - x['longitude']\n",
" return math.sqrt(math.pow(d1, 2) + math.pow(d2, 2))\n",
" \n",
" '''\n",
" params:\n",
" neighbors: A list of tuples ((neighbor), distance)\n",
" \n",
" return: neighbors for the given row:\n",
" neighors is an arrays of tuples ((neighbor), distance)\n",
" '''\n",
" def get_weights(self, neighbors):\n",
" weights = []\n",
" total_distance = 0\n",
" for tuple in neighbors:\n",
" total_distance += tuple[1]\n",
" \n",
" for tuple in neighbors:\n",
" weights.append(tuple[1] / total_distance)\n",
" return weights\n",
" \n",
" def getMRAE(self, y_pred, y_true):\n",
" return np.median(np.abs(y_pred - y_true) / y_true)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 206,
"metadata": {},
"outputs": [],
"source": [
"from collections import Counter"
]
},
{
"cell_type": "code",
"execution_count": 207,
"metadata": {},
"outputs": [],
"source": [
"# 1) Compute the distance matrix between all training and test examples\n",
"# 2) Sort the distance matrix and take the top 4 columns and sum. \n",
"\n",
"class fastKNN:\n",
" def __init__(self, train, k, weights='distances'):\n",
" self.Xtrain = train.drop(['close_price', 'close_date'], axis=1).values \n",
" self.Ytrain = train['close_price'].values\n",
" self.k = k\n",
" self.predictions = None\n",
" self.weights = weights\n",
" \n",
" \n",
" def predict(self, Y):\n",
" Y = Y.values #get the numpy array\n",
" num_test = Y.shape[0] #get the number of test examples\n",
" num_train = self.Xtrain.shape[0] #get the number of train examples\n",
" \n",
" dists = self.calc_distances(Y)\n",
" num_test = dists.shape[0]\n",
" y_pred = np.zeros(num_test)\n",
" \n",
" for i in range(num_test):\n",
" dists_i = dists[i]\n",
" \n",
" closest_y = self.Ytrain[dists[i].argsort()[:self.k]]\n",
" y_pred[i] = closest_y.mean()\n",
" return y_pred\n",
" \n",
" def calc_distances(self, Y): \n",
" #Initialize an array of distances between i_th test, and j_th train\n",
" dists = np.zeros((Y.shape[0], self.Xtrain.shape[0]))\n",
" \n",
" #Calculate the Distances \n",
" #Intuitively: Distance = sqrt( (X - Y)^2) = sqrt((X^2 + Y^2 - 2XY))\n",
"\n",
" #square the train's values \n",
" train_2 = self.Xtrain * self.Xtrain\n",
" train_2 = np.sum(train_2, axis = 1)\n",
" \n",
" #make the shape compatible with Y \n",
" train_2_repeat = np.array([train_2] * Y.shape[0])\n",
" \n",
" Y_2 = Y * Y\n",
" Y_2 = np.sum(Y_2, axis = 1)\n",
" Y_2_repeat = np.array([Y_2] * self.Xtrain.shape[0]).transpose()\n",
" \n",
" Y_dot_Xtrain = Y.dot(self.Xtrain.T)\n",
" \n",
" #(x^2 + y^2 - 2xy)\n",
" dists = train_2_repeat + Y_2_repeat - Y_dot_Xtrain\n",
" return dists\n",
" \n",
" @staticmethod\n",
" def getMRAE(y_pred, y_true):\n",
" return np.median(np.abs(y_pred - y_true) / y_true)\n",
" "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Sort the Train Values so they're ordered in time\n",
"### Train on the data that is older. \n",
"### Test on the data that's more recent to prevent time leakage."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Sample Test"
]
},
{
"cell_type": "code",
"execution_count": 208,
"metadata": {},
"outputs": [],
"source": [
"samp_size = 500\n",
"sample = train[:samp_size]\n",
"\n",
"val_split = int(samp_size * 0.25)\n",
"def train_val_split(df, n): return df[n:], df[:n]"
]
},
{
"cell_type": "code",
"execution_count": 209,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"train size 375\n",
"val size 125\n"
]
}
],
"source": [
"train_s, val = train_val_split(sample, val_split)\n",
"print(\"train size\", len(train_s))\n",
"print(\"val size\", len(val))\n",
"\n",
"# Ground-truth prices, and the feature columns the models are given.\n",
"val_gt = val['close_price']\n",
"val_ft = val.drop(['close_price', 'close_date'], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 210,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>latitude</th>\n",
" <th>longitude</th>\n",
" <th>date_ord</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1.501986</td>\n",
" <td>86.350685</td>\n",
" <td>228</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>36.367095</td>\n",
" <td>-98.664280</td>\n",
" <td>217</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>36.599284</td>\n",
" <td>-97.924700</td>\n",
" <td>224</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>36.647982</td>\n",
" <td>-97.866100</td>\n",
" <td>221</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>36.525885</td>\n",
" <td>-98.333570</td>\n",
" <td>219</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" latitude longitude date_ord\n",
"0 1.501986 86.350685 228\n",
"1 36.367095 -98.664280 217\n",
"2 36.599284 -97.924700 224\n",
"4 36.647982 -97.866100 221\n",
"5 36.525885 -98.333570 219"
]
},
"execution_count": 210,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"val_ft.head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### FastKNN: 3.3ms, error: 0.54"
]
},
{
"cell_type": "code",
"execution_count": 211,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"3.3 ms ± 396 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
"source": [
"# Time the vectorised model (k=4), then keep one set of predictions for scoring.\n",
"fastknn = fastKNN(train=train_s, k=4, weights='uniform')\n",
"%timeit fastknn.predict(val_ft)\n",
"preds_fast = fastknn.predict(val_ft)"
]
},
{
"cell_type": "code",
"execution_count": 212,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.545161183041706"
]
},
"execution_count": 212,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"fastKNN.getMRAE(preds_fast, val_gt)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### SKLearnKNNRegressor: 600us, error: 0.59 "
]
},
{
"cell_type": "code",
"execution_count": 218,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',\n",
" metric_params=None, n_jobs=1, n_neighbors=4, p=2,\n",
" weights='uniform')"
]
},
"execution_count": 218,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.neighbors import KNeighborsRegressor\n",
"\n",
"# Reference model: scikit-learn fitted on the same columns fastKNN uses.\n",
"sk_trainy = train_s['close_price'].values\n",
"sk_train = train_s.drop(['close_price', 'close_date'], axis=1).values\n",
"\n",
"neigh = KNeighborsRegressor(n_neighbors=4)\n",
"neigh.fit(X=sk_train, y=sk_trainy)"
]
},
{
"cell_type": "code",
"execution_count": 214,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"600 µs ± 58.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n"
]
},
{
"data": {
"text/plain": [
"0.5914010079839274"
]
},
"execution_count": 214,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Time scikit-learn's predict, then score a single run with the shared metric.\n",
"%timeit neigh.predict(val_ft)\n",
"sk_preds = neigh.predict(val_ft)\n",
"fastKNN.getMRAE(y_pred=sk_preds, y_true=val_gt)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Original Implementation: 12.1s, error: 0.99"
]
},
{
"cell_type": "code",
"execution_count": 220,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"12.1 s ± 684 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
"source": [
"# Time the row-by-row implementation (k=4, plain mean) and keep its predictions.\n",
"knn_uniform = KNNRegressor(train=train_s, k=4, weights='uniform')\n",
"%timeit knn_uniform.predict(val_ft)\n",
"preds = knn_uniform.predict(val_ft)"
]
},
{
"cell_type": "code",
"execution_count": 221,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.9990856865693482"
]
},
"execution_count": 221,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"fastKNN.getMRAE(preds, val_gt)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Speed ranking: scikit-learn (about 5x faster than FastKNN) > FastKNN > Original Implementation"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Test on All"
]
},
{
"cell_type": "code",
"execution_count": 229,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"train size 80868\n",
"val size 5000\n"
]
}
],
"source": [
"# Full data set: hold out the first 5000 rows for validation.\n",
"train_s, val = train_val_split(train, 5000)\n",
"print(\"train size\", len(train_s))\n",
"print(\"val size\", len(val))\n",
"\n",
"val_gt = val['close_price']\n",
"val_ft = val.drop(['close_price', 'close_date'], axis=1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Sklearn Test"
]
},
{
"cell_type": "code",
"execution_count": 234,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',\n",
" metric_params=None, n_jobs=1, n_neighbors=4, p=2,\n",
" weights='uniform')"
]
},
"execution_count": 234,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.neighbors import KNeighborsRegressor\n",
"\n",
"# Refit the scikit-learn reference model on the full training split.\n",
"sk_trainy = train_s['close_price'].values\n",
"sk_train = train_s.drop(['close_price', 'close_date'], axis=1).values\n",
"\n",
"neigh = KNeighborsRegressor(n_neighbors=4)\n",
"neigh.fit(X=sk_train, y=sk_trainy)"
]
},
{
"cell_type": "code",
"execution_count": 233,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.41419507220253743"
]
},
"execution_count": 233,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Score scikit-learn on the held-out rows with the shared MRAE metric.\n",
"sk_preds = neigh.predict(val_ft)\n",
"fastKNN.getMRAE(y_pred=sk_preds, y_true=val_gt)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Implementation Test"
]
},
{
"cell_type": "code",
"execution_count": 236,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.42151694663154804"
]
},
"execution_count": 236,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Score the vectorised implementation on the same held-out rows.\n",
"fastknn = fastKNN(train=train_s, k=4, weights='uniform')\n",
"preds_fast = fastknn.predict(val_ft)\n",
"fastKNN.getMRAE(y_pred=preds_fast, y_true=val_gt)"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment