Skip to content

Instantly share code, notes, and snippets.

@ctivanovich
Last active May 9, 2018 01:14
Show Gist options
  • Save ctivanovich/061ede7481a9e8ae32148cf911ae4ded to your computer and use it in GitHub Desktop.
Save ctivanovich/061ede7481a9e8ae32148cf911ae4ded to your computer and use it in GitHub Desktop.
Predictive modeling of trading data from a cryptocurrency platform
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from IPython.core.display import HTML\n",
"HTML(\"\"\"\n",
"<style>\n",
".output_png {\n",
" display: table-cell;\n",
" text-align: center;\n",
" vertical-align: middle;\n",
"}\n",
"</style>\n",
"\"\"\")\n",
"\n",
"%matplotlib inline\n",
"\n",
"import csv\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.utils import resample\n",
"from sklearn.decomposition import PCA\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.metrics import precision_recall_fscore_support\n",
"from sklearn.model_selection import GridSearchCV\n",
"\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.ensemble import AdaBoostClassifier\n",
"from xgboost import XGBClassifier\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn.svm import SVC\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def dimensioncounter(fileobj):\n",
"\n",
" reader = csv.reader(fileobj)\n",
" for row in reader:\n",
" ncols = len(row)\n",
" break\n",
" nrows = sum(1 for row in reader) + 1 # + 1 for the row used to count columns\n",
" \n",
" return nrows, ncols\n",
"\n",
"# with open('Test_Full.csv') as f:\n",
"# print(dimensioncounter(f))\n",
"#1228 columns, 520561 rows\n",
"# with open('Training_Full.csv') as f: \n",
"# print(dimensioncounter(f))\n",
"#1229 columns, 1214641 rows, column 0 is labels column"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def sampler(name, m, n):\n",
" with open(name+'.csv', 'r') as csv_in, open(name+'_sample.csv', 'w') as csv_out:\n",
" reader = csv.reader(csv_in, delimiter=',')\n",
" writer = csv.writer(csv_out)\n",
" for i, row in enumerate(reader):\n",
" if i%m == 0:\n",
" writer.writerow(row)\n",
"# sampler(\"Training_Full\", 100, None)\n",
"# sampler(\"Test_Full\", 1, None)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"#Extracting labelled data\n",
"# training_chunker = pd.read_csv(\"Training_Full.csv\", chunksize = 10000)\n",
"# train_df = pd.concat([chunk[(chunk[\"I_VOLUME_BJ_CLASS\"] == 1) | (chunk[\"I_VOLUME_BJ_CLASS\"] == -1)] for chunk in training_chunker])\n",
"# train_df.to_csv(\"Training_subset.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"train_df = pd.read_csv(\"Training_subset.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_df.drop('Unnamed: 0', axis = 1, inplace=True)\n",
"train_df.iloc[:,0].nunique()"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"X = train_df[train_df.columns[1:201]]\n",
"y = train_df[train_df.columns[0]]"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Some out-of-the-box comparisons of ML classifiers"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.69441162060120476"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"knn_model = KNeighborsClassifier(3).fit(X_train, y_train)\n",
"pred = knn_model.predict(X_test)\n",
"np.mean(pred == y_test)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.70392252932530186"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"xgb_model = XGBClassifier(learning_rate=0.03, objective='binary:logistic').fit(X_train, y_train)\n",
"pred = xgb_model.predict(X_test)\n",
"np.mean(pred == y_test)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.7016456754186241"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ada_model = AdaBoostClassifier().fit(X_train, y_train)\n",
"pred = ada_model.predict(X_test)\n",
"np.mean(pred == y_test)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.66783871804478778"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"forest_model = RandomForestClassifier().fit(X_train, y_train)\n",
"pred = forest_model.predict(X_test)\n",
"np.mean(pred == y_test)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.57079862812346893"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.naive_bayes import GaussianNB\n",
"bayes = GaussianNB().fit(X_train, y_train)\n",
"bayes.score(X_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAXwAAAEVCAYAAADjHF5YAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAH3ZJREFUeJzt3XmYHXWd7/H3J519IUASIBASFolEJCyGHUKzXA3goANMYBhQGUfFDRy83ov43EnQcZThujDjzCjC5RIHhYiAwICAQkcCsiUhIYbtQjBsgSQkRAiQ7Xv/qDqk6fRy+vSpU3XO+byep55TZ6v6pJN8q/pbVb9SRGBmZo2vX94BzMysNlzwzcyahAu+mVmTcME3M2sSLvhmZk3CBd/MrEkUpuBLulLSK5IWlfHZoyTNk7RB0ikd3ttV0h2SlkhaLGl8dqnNzOpHYQo+cBXwkTI/+yfgk8A1nbw3C7gkIj4AHAy8Wp14Zmb1rTAFPyLmAqvbvyZpD0m3S3pY0hxJE9PPLouIxUB0+PwkoCUi7k4/ty4i3q7RH8HMrNAKU/C7cDnwpYg4CPga8B89fH4i8LqkX6Utn0skKfOUZmZ1oH/eAboiaRhwOPDLdkV7QA9f6w8cCewPPA/MBj5F0i4yM2tqhS34JL99rI6IA3vxnReARyPiTwCSbgIOwQXfzCz7lo6k5yQtlLRA0kM9fTydiIg/A0slndZuWZO7+E7Jw8C2kkalz48FllSe3syscSjr0TIlPQt8KCJW9/C5nwOtwCjgFWAGcDfwY2AsyW8j10bEP0qaAtwIbAu8DSyPiH3T5RwHfD9d7DzgsxGxsdp/LjOzelOLgr8UmBIRqzJdkZmZdasWZ+kEcFd6auVnarA+MzPrRC0O2h4RES9LGkNS+B9Pz7k3M7MayrzgR8TL6eMKSTeSXP36noIvybfdMjPrpYjo1XVGmbZ0JA2VNDydHwZ8GFjc2WcjolDTjBkzcs/gTI2Tqai5nKl+M1Ui6z38HYEb0z34/sA1EXFnxus0M7NOZFrwI2IpyVWvZmaWs6KPpZOb1tbWvCNsxZnKU8RMUMxczlSeImaqRObn4ZcVQooi5DAzqxeSiCIdtDUzs+JwwTczaxIu+GZmTcIF38ysSbjgm5k1CRd8M7Mm4YJvZtYkXPDNzJqEC76ZWZNwwTczaxIu+GZmTcIF38ysSbjgm5k1CRd8M7Mm4YJvZtYkXPDNzJpEYQr++vV5JzAza2yFKfjz5+edwMyssRWm4P/+93knMDNrbC74ZmZNojA3MR85Mli1Clpa8k5jZlZ8dX0T87FjYdGivFOYmTWuwhT8qVPd1jEzy1JhCv7RR7vgm5llqTA9/GXLggMPhFdfBfWqK2Vm1nzquoe/664wYgQ88UTeSczMGlNhCj64j29mliUXfDOzJlG4gj9nDhTgsIKZWcMpVMHfc0/YvBmWLs07iZlZ4ylUwZfc1jEzy0qhCj644JuZZaVwBd8XYJmZZaNwBX/SJFizBl58Me8kZmaNpSYFX1I/SfMl3dxjoH5w1FFw7721SGZm1jxqtYd/PrCk3A+7j29mVn2ZF3xJ44ATgSvK/Y4LvplZ9dViD/8HwNeAsi+n2m8/eP55WLkyu1BmZs2mf5YLl3QS8EpEPCqpFehyZLeZM2e+O9/a2srhh7cydy58/ONZJjQzqw9tbW20tbX1aRmZDo8s6Z+As4CNwBBgBHBDRHyiw+eiY47vfCcZKvkHP8gsnplZ3Src8MgRcVFEjI+IPYAzgLs7FvuuuI9vZlZdhTsPv2TKFHjySXj99byTmJk1hpoV/IiYExEnl/v5QYPg4IPh/vuzTGVm1jwKu4cPbuuYmVWTC76ZWZMozE3MO8uxbh3ssENyts7QoTkEMzMrqMKdpdNXQ4fC5Mnw4IN5JzEzq3+FLvjgto6ZWbXURcGfMyfvFGZm9a/QPXxIzsPfZRd47TUYOLDGwczMCqrhevgAI0fCxInwyCN5JzEzq2+FL/jgPr6ZWTXURcH3fW7NzPqu8D18gBUrYK+9YNUqaGmpYTAzs4JqyB4+wJgxyYHbhQvzTmJmVr/qouCD+/hmZn3lgm9m1iTqoocP8MILcMABybg66lXXysys8TRsDx9g3DjYZhtYsiTvJGZm9anHgi9pR0lXSro9ff4BSZ/OPtrW3NYxM6tcOXv4/xe4A9g5ff4U8JWsAnXHBd/MrHLlFPzRETEb2AwQERuBTZmm6kLpAqwCHHYwM6s75RT8NyWNAgJA0qFALrcW33335IDts8/msXYzs/rWv4zPXADcDOwp6T5gDHBapqm6IG1p6+y5Zx4JzMzqV497+BExHzgaOBz4HLBPRCzKOlhX3Mc3M6tMOWfpfBEYHhF/jIjFwHBJX8g+Wudc8M3MKlNOD/8zEbGm9CQiVgOfyS5S9yZNgrVrkwuxzMysfOUU/BZpy7WtklqA3O49JcFRR3kv38yst8op+L8BrpN0nKTjgF+kr+XGbR0zs97rcSwdSf1IDtYel750F3BFRFTtXPxyxtJpb/58OOssD7NgZs2rkrF06mbwtPY2bYJRo+Cpp2CHHTIMZmZWUJkMnibpCEl3SXpK0rOSlkrK9dKnlhY44giYOzfPFGZm9aWcC6+uBP4emEdOQyp0ptTHP+WUvJOYmdWHcgr+6xFxe+ZJemnqVPjiF/NOYWZWP8o5aPtdoAW4AXin9Hp6BW51QvSyhw+wfn3Sx3/hBRg5slpJzMzqQyU9/HL28A9JH6e0ey2AY3uzomobOBAOPhjuuw9OPDHPJGZm9aHHgh8Rx9QiSCVKfXwXfDOznpWzh4+kk4B9gMGl1yLim1mFKtfUqXDRRXmnMDOrD+Wclvlj4HTgy4CAvwImlLNwSYMkPShpgaTHJM3oU9oODjkEFi2CN9+s5lLNzBpTOUMrHB4RnwBWR8TFwGHAxHIWHhHvAMdExAHA/sAJkg6uOG0HQ4fC/vvDAw9Ua4lmZo2rnIL/Vvq4TtLOwAZgbLkriIh16ewgkhZSVS/tLd320MzMuldOwb9V0rbApcB84DmSAdTKIqmfpAXAcuCuiHi4kqBd8UBqZmbl6dVYOpIGAYMjotf3tJW0DXAT8KWIWNLhvV6fh1+ydi3svDOsWgWDBlW0CDOzulPV8/AlHRsRd0vaavCCdEU39GZFEbFW0j3ANGCrcS5nzpz57nxrayutra1lLXebbWDvveGRR5LxdczMGlFbWxttbW19WkaXe/iSLo6IGZKu6uTtiIi/7XHh0mhgQ0S8LmkIcAfw3Yi4rcPnKt7DB7jgAhgzBr7+9YoXYWZWV6o+PHI6Fv5pETG7wkD7AleTHCvoB1wXEd/u5HN9Kvg33QQ/+QncXrgRf8zMspHJePiSHomIKd1+qI/6WvBXroQ990z6+P3LupTMzKy+ZTIePvBbSf9d0q6Sti9NFWbMxOjRsOuu8OijeScxMyuucvaHT08f2w9GHMAe1Y9TudLpmVMy/V3EzKx+9biHHxG7dzIVqtiDz8c3M+tJWefhS/og8AHeO3jarKqF6GMPH+Cll2DffWHFCuhXTqPKzKyOZXVP2xnAv6bTMcA/AydXlDBDO+8M228PS7Y6w9/MzKC8g7anAccByyPiHGA/oJD3mHJbx8ysa2UNnhYRm4GN6fAIrwK7ZhurMi74ZmZdK6fgP5IOnvZTYB7JAGp/yDRVhUoFv4+HA8zMGlJvB0/bDdgmIhZVNUQVDtpCUujHj4d77oH3va8KwczMCiqrg7Y3SzpT0rCIeK7axb6apGQvf86cvJOYmRVPOS2d7wFHAkskXS/pNEmDe/pSXtzHNzPrXNktHUktwLHAZ4BpEbFN1UJUqaUD8PjjcOKJsHRpVRZnZlZIWY2lQzq08anAucBBJCNgFtLeeyc3NV+2LO8kZmbFUk4PfzbwOMne/Y+APSPiy1kHq1Spj3/vvXknMTMrlnL28K8kKfLnRsQ96Tn5heY+vpnZ1soZPO2OiNhUizDV4oJvZra1hhxmbN99YflyePXVvJOYmRVHQxb8lpbkhubu45uZbdHlDVAkHdjdFyNifvXjVE/pAqxTT807iZlZMXR3x6vvpY+DgSnAQkDAZOAR4LBso/XN1Klw7rl5pzAzK44uWzoRcUxEHAO8DBwYEVMi4kPAAcCLtQpYqQMPhGeegdWr805iZlYM5fTw3x8Rj5WeRMRiYFJ2kapj4EA45BC47768k5iZFUM5BX+RpCsktabTT4HCDqDW3tFH+/RMM7OScgr+OcAfgfPTaUn6WuH5fHwzsy3KvYn5EGB8RDyZSYgqDp7W3ltvwZgxyTn5w4dXffFmZrnJajz8k4FHgd+kz/eXdHNlEWtryBA44AB44IG8k5iZ5a+cls4M4GBgDUBEPArsnmWoanJbx8wsUU7B3xARr3d4rW7uGuuCb2aWKKfg/1HSmUCLpL0k/Stwf8a5qubww+GRR+Dtt/NOYmaWr3IK/peBfYB3gF8Aa4GvZBmqmkaMgEmT4OGH805iZpavsm9xmGmIjM7SKfnqV2H77eEb38hsFWZmNZXVWToTJV0u6U5Jd5emymPWni/AMjMrYw9f0kLgx8A84N0boUTEvKqFyHgP/7XXYLfdksf+3Q0XZ2ZWJyrZwy+n/G2MiP+oMFMhbL99UvAXLICDDso7jZlZPso5aHuLpC9IGitp+9KUebIq8+mZZtbsymnpLO3k5YiIPaoWIuOWDsDs2XDNNfDrX2e6GjOzmqikpZPpWTqSxgGzgB2BzcBPI+JfOvlc5gX/5Zdhn31g5Uro15A3djSzZlLVHr6kYyPibkmndPZ+RNxQxvI3AhdExKOShgPzJN0ZEU/0JmQ1jB0Lo0fD4sUweXKt125mlr/uDtoeDdwN/EUn7wXQY8GPiOXA8nT+DUmPA7sANS/4sKWP74JvZs2oZhdeSdoNaAM+GBFvdHgv85YOwKxZcOutST/fzKyeZdbDl3QSyfAKg0uvRcQ3exFsOEmx/1ZEbHXYtFYF/7nn4NBDk36+evVjMjMrlkzOw5f0Y2AocAxwBXAa8FAvQvUHrgd+1lmxL5k5c+a7862trbS2tpa7irLtthsMGgRPPw0TJ1Z98WZmmWlra6Otra1PyyjntMxFETG53eNw4PaIOKqsFUizgJURcUE3n6nJHj7A2WcnQy383d/VZHVmZpnIZCwd4K30cZ2knYENwNgyAx0B/A1wrKQFkuZLmtabgNXmC7DMrFmVM7TCrZK2BS4F5pOcoXNFOQuPiPuAlsrjVd/UqfDtb+edwsys9np1lo6kQcDgTu6A1bcQNWzpRMBOO8FDD8GECTVZpZlZ1VX7wqtOL7hqt6JyLrwqHCnZy7/3Xhd8M2su3bV0OrvgqqSsC6+KaupUmDMHzjor7yRmZrXTFHe86mjhQpg+HZ58smarNDOrqqzueDVK0r+kZ9jMk3SZpFGVx8zfBz8IK1bA8uV5JzEzq51yTsu8FlgBnEpy0dUK4LosQ2WtpQWOPDLp45uZNYtyCv7YiPhWRCxNp38kGe64rvl8fDNrNuUU/DslnSGpXzpNB+7IOljWXPDNrNmUM7TCn4FhbLmBeQvwZjofEbFNn0PU+KAtwIYNMGpUMqDa9nV3w0Yza3aZHLSNiBER0S8iBqRTv/S1EdUo9nkZMCAZOfO++/JOYmZWG+WcpfPpDs9bJM3ILlLtuK1jZs2knB7+cZJukzRW0geBB4ARGeeqidIFWGZmzaDcG6CcDvwbSe/+zHRQtOqFyKGHD/D220kff/lyGNEQmzAzaxZZXXi1F3A+8CvgT8DZkoZWFrFYBg+GD30I/vCHvJOYmWWvnJbOLcD/iojPkdzY/Gng4UxT1ZD7+GbWLMop+AdHxO8gOQczIr4H/GW2sWrn6KNd8M2sOXRZ8CX9D4CIWCvprzq8/aksQ9XSYYfB/PlJP9/MrJF1t4d/Rrv5r3d4L9fbFFbT8OGwzz7JDVHMzBpZdwVfXcx39ryuuY9vZs2gu4IfXcx39ryuueCbWTPo8jx8SZtIzrsXMARYV3qL5L62A6oWIqfz8EtWr05ud7hqVTLkgplZ0VX1PPyIaImIbdIxc/qn86XnDVUWt9sOdt89OXhrZtaoyjktsym4rWNmjc4FP+WCb2aNrilvYt6ZV16BvfeGlSuTWyCamRVZJmPpNIsdd0ymxYvzTmJmlg0X/Hbc1jGzRuaC344Lvpk1Mvfw21m2DA46KBkfXw11LbGZNRr38Pto/HgYMgSeeirvJGZm1eeC34Fve2hmjcoFvwP38c2sUbngd1Dawy/AIQUzs6pywe9gr71gwwZ47rm8k5iZVZcLfgcSnHUWnHsurF+fdxozs+rxaZmd2LgRTjkFRo6Eq6+Gft4smlnBFO60TElXSnpF0qIs11Nt/fvDtdfCM8/A1zve3NHMrE5lve96FfCRjNeRiaFD4ZZb4Ne/hssuyzuNmVnf9c9y4RExV9KELNeRpVGj4I474IgjYOxYmD4970RmZpXLtOA3ggkT4Lbb4PjjYcwYOOaYvBOZmVWmMAV/5syZ7863trbS2tqaW5aOJk+G666D00+Hu+6C/fbLO5GZNZu2tjba2tr6tIzMz9JJWzq3RMTkbj5TqLN0ujJ7NlxwAcydC7vtlncaM2tmlZylU4s9fKVT3Zs+PRlJc9o0uO++pMdvZlYvsj4t8+fA/cBEScsknZPl+mrhvPPgYx+Dj34U1q3LO42ZWfl84VUFNm+GT30K1qyBG25Izts3M6ulwl141aj69YMrr4R33oHPf94DrZlZfXDBr9CAAXD99bBgAVx8cd5pzMx65mZEH4wYAf/1X1suzPrc5/JOZGbWNRf8PtpxR/jNb5Jx9HfaKTmga2ZWRC74VfC+98HNN8MJJ8Do0ckev5lZ0biHXyVTpsDPfpYMq/z443mnMTPbmgt+FU2bBpdemjy++GLeaczM3sstnSr7xCfg5ZeTon/vvbDttnknMjNL+MKrDETAV74CCxcmB3QHD847kZk1mkouvHLBz8jmzXDGGUnxv/ZaaGnJO5GZNRJfaVsg/frBrFmwcmWyt99g2zMzq0Mu+BkaPBhuugnmzIFLLsk7jZk1Ox+0zdjIkXD77Vuuxv3kJ/NOZGbNygW/BnbZJSn6ra3JlbnTpuWdyMyakVs6NTJpEtx4I5x9Njz8cN5pzKwZueDX0OGHJ8Mqn3wyPP103mnMrNm4pVNjJ5+85TaJ99+ftHjMzGrBBT8Hn/0svPQSnHgitLUlwyybmWXNF17lJCIZP/+55+DWW2HgwLwTmVk98ZW2dWbjRjj11GQPf9as5GItM7Ny+ErbOtO/P/ziF/Dss3DhhXmnMbNG54Kfs6FD4ZZbkumHP8w7jZk1Mh+0LYBRo5JRNUtX455+et6JzKwRueAXxIQJcNttcPzxMGYMHHts3onMrNG4pVMgkyfD7NnJsMoLF+adxswajQt+wbS2wo9+BCedlJyyaWZWLW7pFND06Vuuxp07F0aPzjuRmTUCn4dfYBdemIyl/7vfJWfzmJmV+MKrBhORjJ+/enUy0mZ//z5mZilfeNVgpGR0zQ0b4NxzfZtEM+sb7+HXgTfeSA7mbtwIe+wB48ZtmXbdNXnceWcYNCjvpGZWK27pNLB16+Cxx+CFF5Lp+effO//yy7Dddls2AB03COPGJXfeGjw47z+JmVWDC34T27QJXn218w1C6flLLyX32G2/Qei4URg3DoYMyftPY2Y9ccG3bm3eDCtWdL1BeOEFePFFGD68+w3CuHEwbFjefxqz5uaCb30WAStXdr1BKE1DhiSFf6edklbStttueSxNnT33uP9m1VHIgi9pGvBDkjOCroyISzr5jAt+HYmAVauSwr98OaxZs2Vavbrr56tXw4AB3W8Quns+cqRPTTUrKVzBl9QPeAo4DngJeBg4IyKe6PC5whX8trY2Wltb847xHvWeKSI5+FzuBqLj87VrkwvQetpAvPRSG5MntzJwYLKBGTCAd+d781r//smpsXn8rGrFmcpTxEyVFPys95cOBp6OiD8BSLoW+BjwRLffKoAi/gXXeyYp6f0PG5acMdRbmzfDn//c/QZi6VKYM6eNF19sZcMGWL+e9zz25rWNG5OiX8nGorP3Fi5s49hjWxk6lK2mIUO2fq39NGBA739e5aj3f1O1UsRMlci64O8CPN/u+QskGwGzXuvXL2nrjByZDCfdlZkzk6mvNm9Oin4lG4uOr61fnwyGN2xY8lvOa68lj11Nb721Zf7NN5M83W0QuttgdPXekCHJQfzHH082xqXfZvKef/vt5Lc5Kfk77+6xmr+BNQN3RM260K9fsmderQPNy5fDN75R2Xc3bOh8Y9DdhmLduuQAfHfvL1+ejNVU6qhG1H6+4/O33oJ///dkgxvR9WPpe6UNQE8bh+4ee/rMqlVw3XVd//10t+HpaaPUl+/2VtY9/EOBmRExLX1+IRAdD9xKKlYD38ysDhTtoG0L8CTJQduXgYeAv46IxzNbqZmZdSrTlk5EbJL0JeBOtpyW6WJvZpaDQlx4ZWZm2ct1eGRJV0p6RdKiPHOUSBon6W5Jf5T0mKTz8s4EIGmQpAclLUhzzcg7EyTXWUiaL+nmvLOUSHpO0sL0Z/VQ3nkAJI2U9EtJj6f/tg7JOc/E9OczP318vQj/1iX9vaTFkhZJukZSIa7LlnR++v8ut5rQWa2UtJ2kOyU9KekOSSN7Wk7e4+FfBXwk5wztbQQuiIh9gMOAL0raO+dMRMQ7wDERcQCwP3CCpCKc3no+sCTvEB1sBloj4oCIKMLPCOAy4LaImATsB+Ta1oyIp9Kfz4HAh4A3gRvzzCRpZ+DLwIERMZmk3XxGnpkAJO0DfBqYQvJ/76OS9sghSme18kLgtxHxfuBu4Os9LSTXgh8Rc4HVeWZoLyKWR8Sj6fwbJP8xK7hEqPoiYl06O4jkP0OuvThJ44ATgSvyzNEJkf+OzLskbQMcFRFXAUTExohYm3Os9o4HnomI53v8ZPZagGGS+gNDSa7Oz9sk4MGIeCciNgG/B06pdYguauXHgKvT+auBj/e0nML8xygaSbuRbNEfzDdJIm2fLACWA3dFxMM5R/oB8DVy3vB0IoC7JD0s6TN5hwF2B1ZKuiptoVwuqUgDUJ8O/CLvEBHxEvA9YBnwIrAmIn6bbyoAFgNHpe2ToSQ7ObvmnKlkh4h4BZKdVWCHnr7ggt8JScOB64Hz0z393EXE5rSlMw44RNIH8soi6STglfS3IaVTURyRtipOJGnJHZlznv7AgcC/pbnWkfwqnjtJA4CTgV8WIMu2JHusE4CdgeGSzsw3FaTjfl0C3AXcBiwANuUaqms97ny54HeQ/jp5PfCziPh13nk6StsB9wDTcoxxBHCypGdJ9g6PkTQrxzzvioiX08cVJH3pvPv4LwDPR8Qj6fPrSTYARXACMC/9WeXteODZiHgtbZ3cAByecyYAIuKqiJgSEa3AGpIBIYvgFUk7AkjaCXi1py8UoeAXbQ/x/wBLIuKyvIOUSBpdOgKftgP+GzkOQBcRF0XE+IjYg+TA2t0R8Ym88pRIGpr+doakYcCHSX4lz036K/fzkiamLx1HcQ50/zUFaOeklgGHShosSSQ/p0JcsyNpTPo4HvhL4Od5ReG9tfJm4FPp/CeBHndQcx1LR9LPgVZglKRlwIzSwa2c8hwB/A3wWNovD+CiiPhNXplSY4Gr0+Gm+wHXRcRtOWcqoh2BG9OhOvoD10TEnTlnAjgPuCZtoTwLnJNzHtJ+9PHAZ/POAhARD0m6nqRlsiF9vDzfVO/6laTtSXJ9IY+D7p3VSuC7wC8l/S3wJ2B6j8vxhVdmZs2hCC0dMzOrARd8M7Mm4YJvZtYkXPDNzJqEC76ZWZNwwTczaxIu+FYIkjZLurTd869K+od0/mPtRy2VdI+kolytWjOSehwN0aw7LvhWFO8Ap6QXuHT0cWCfGucpoovyDmD1zQXfimIjyZWVF7R/UdJhJAN8/XM62mRpLPLp6U1hnkivkN6KpP+Z3kxjgaR/Sl/bX9IfJD0q6Vfthqy4R9L301E2/yhpSvr+k5K+lX5mQnoTk/+UtETSbEmD0/eOS/MtlHRFelUtkpZKmilpXvrexPT1oelNLR5I3/uL9PVPpuu9PV33d9PXvwMMSdfxs2r+4K2JRIQnT7lPwFpgOLAUGAF8FfiH9L2rgFPaffYe4NJ0/gSS4aI7Lm8aMBcYlD7fNn1cCByZzl8MfL/dMr+Tzp9HMkTvDsBA4HlgO5KRHDcDh6afu5JkAzWIZCyYPdPXrwbOS+eXklyOD/B54PJ0/tvAmen8SOBJYAjJmCj/L/1ZDAKeA3Yp/Yzy/nvyVN+T9/CtMCIZivpqkjtp9eSG9HEeSSHu6HjgqkjuFkZErElvRjIykptJkK5rarvvlG7V+BiwOCJejYj1wDNsGQN9WUQ8kM7/J3Ak8H6SkR6f6WK5pbtJzQN2S+c/DFyYjtnURrJhGZ++97uIeCPNvqSLP59Zr+U6eJpZJy4D5pPs1XfnnfRxE9X7d1xa5uZ285AMotfVOkqDUXU34mtnWQWcGhFPt/+gpEM7rLvjd8wq5j18KwoBRMRqYDbJfURL/gxs09N3O7gLOKd0dylJ20UyyuHqdj3/s4E5vcw5XltuQn4mcC9JO2ZCu+MLZ5PstXfnDpLWEWm+/ctY93pJLb2La7aFC74VRfthW78HjGr32rXA19KDm3uw9Z19thryNSLuIGnRPCJpPskxAUjGD//fkh4luaH4N7taRhfLf5LkTlpLgG2BH6etl3OA6yUtJNkr/0kPy/0WMCA9qLy4XY7u1n05ydDdPmhrFfHwyGZlkjQBuDUi9s07i1klvIdv1jveQ7K65T18M7Mm4T18M7Mm4YJvZtYkXPDNzJqEC76ZWZNwwTczaxIu+GZmTeL/A5VpI7vjSeyaAAAAAElFTkSuQmCC\n",
"text/plain": [
"<matplotlib.figure.Figure at 0x1f8abcf0ba8>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"pca = PCA()\n",
"pca.fit_transform(X_train)\n",
"plt.plot(range(1, 11), pca.explained_variance_[:10])\n",
"plt.xlabel(\"Nth component\")\n",
"plt.ylabel(\"Explained variance\");"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"param_grid = [{\n",
"# 'pca__n_components':[7,8],\n",
" 'forest__criterion': ['entropy','gini'],\n",
" 'forest__min_samples_split' : [15], \n",
" 'forest__min_samples_leaf' : [50, 100],\n",
"# 'ada__n_estimators':[25, 50],\n",
"# 'xgb__learning_rate':[0.03, 0.3],\n",
"# 'xgb__gamma':[0.01, 0.1],\n",
"# 'xgb__objective':['binary:logistic']\n",
" \n",
"}]\n",
"\n",
"pipe = Pipeline(steps=\n",
" [('forest', RandomForestClassifier())],\n",
"# [('xgb', XGBClassifier())],\n",
" )\n",
"clf = GridSearchCV(pipe, param_grid)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.70251030348445109"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clf.fit(X_train,y_train)\n",
"clf.score(X_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(0.69840263691683568, 0.66445543360270176, 0.68100624266023857, None)"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"precision_recall_fscore_support(y_test, clf.predict(X_test), average='binary')"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(Pipeline(memory=None,\n",
" steps=[('forest', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',\n",
" max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
" min_impurity_decrease=0.0, min_impurity_split=None,\n",
" min_samples_leaf=100, min_samples_split=15,\n",
" min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,\n",
" oob_score=False, random_state=None, verbose=0,\n",
" warm_start=False))]), array([-1, 1], dtype=int64))"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clf.best_estimator_, clf.classes_"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"test_data = pd.read_csv(\"Test_Full.csv\", chunksize = 1000)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"predictions = []\n",
"for chunk in test_data:\n",
" predictions.append(clf.predict_proba(chunk.iloc[:,0:200]))"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(520560, 2)"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pred_df = pd.concat([pd.DataFrame(pred) for pred in predictions])\n",
"pred_df.shape"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.4845007791752725"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pred_df[1].mean()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [],
"source": [
"with open('predictions.csv', 'w') as outfile:\n",
" writer = csv.writer(outfile)\n",
" writer.writerow(['Label'])\n",
" for chunk in predictions:\n",
" for row in chunk:\n",
" writer.writerow([row[1]])"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
"probs = pd.read_csv('predictions.csv')\n",
"full_test = pd.read_csv('Test_Full.csv')\n",
"full_test['BUY_PROBA'] = probs"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
"full_test.to_csv('labelled_test.csv')"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 518560.000000\n",
"mean 0.491775\n",
"std 0.211203\n",
"min 0.038531\n",
"25% 0.322517\n",
"50% 0.493593\n",
"75% 0.658734\n",
"max 0.975740\n",
"Name: prob_buy, dtype: float64"
]
},
"execution_count": 100,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"full_test.prob_buy.describe()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment