infinite-Joy/inserting_pca_to_production.ipynb

## inserting_pca_to_production.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Date</th>\n",
       "      <th>Open</th>\n",
       "      <th>High</th>\n",
       "      <th>Low</th>\n",
       "      <th>Last</th>\n",
       "      <th>Close</th>\n",
       "      <th>Total Trade Quantity</th>\n",
       "      <th>Turnover (Lacs)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2017-08-02</td>\n",
       "      <td>209.10</td>\n",
       "      <td>209.95</td>\n",
       "      <td>204.1</td>\n",
       "      <td>205.80</td>\n",
       "      <td>206.70</td>\n",
       "      <td>2935.0</td>\n",
       "      <td>6.09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2017-08-01</td>\n",
       "      <td>212.00</td>\n",
       "      <td>214.95</td>\n",
       "      <td>208.3</td>\n",
       "      <td>208.30</td>\n",
       "      <td>209.15</td>\n",
       "      <td>5094.0</td>\n",
       "      <td>10.78</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2017-07-31</td>\n",
       "      <td>212.00</td>\n",
       "      <td>215.00</td>\n",
       "      <td>210.2</td>\n",
       "      <td>210.35</td>\n",
       "      <td>212.00</td>\n",
       "      <td>6803.0</td>\n",
       "      <td>14.47</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2017-07-28</td>\n",
       "      <td>208.00</td>\n",
       "      <td>213.95</td>\n",
       "      <td>208.0</td>\n",
       "      <td>211.95</td>\n",
       "      <td>211.25</td>\n",
       "      <td>2023.0</td>\n",
       "      <td>4.28</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2017-07-27</td>\n",
       "      <td>213.05</td>\n",
       "      <td>215.30</td>\n",
       "      <td>209.0</td>\n",
       "      <td>210.95</td>\n",
       "      <td>210.50</td>\n",
       "      <td>6714.0</td>\n",
       "      <td>14.23</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         Date    Open    High    Low    Last   Close  Total Trade Quantity  \\\n",
       "0  2017-08-02  209.10  209.95  204.1  205.80  206.70                2935.0   \n",
       "1  2017-08-01  212.00  214.95  208.3  208.30  209.15                5094.0   \n",
       "2  2017-07-31  212.00  215.00  210.2  210.35  212.00                6803.0   \n",
       "3  2017-07-28  208.00  213.95  208.0  211.95  211.25                2023.0   \n",
       "4  2017-07-27  213.05  215.30  209.0  210.95  210.50                6714.0   \n",
       "\n",
       "   Turnover (Lacs)  \n",
       "0             6.09  \n",
       "1            10.78  \n",
       "2            14.47  \n",
       "3             4.28  \n",
       "4            14.23  "
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "import numpy as np\n",
    "from sklearn import metrics\n",
    "\n",
    "df_hitech = pd.read_csv(\"NSE-HITECHCORP.csv\")\n",
    "df_hitech.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "below are the shapes: \n",
      "(51, 8) (55, 8) (53, 8)\n"
     ]
    }
   ],
   "source": [
    "df_bhagyanar = pd.read_csv(\"NSE-BHAGYANGR.csv\")\n",
    "\n",
    "df_hudco = pd.read_csv(\"NSE-HUDCO.csv\")\n",
    "\n",
    "print('below are the shapes: ')\n",
    "print(df_hitech.shape, df_bhagyanar.shape, df_hudco.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[  209.1    209.95   204.1    205.8    206.7   2935.       6.09]\n"
     ]
    }
   ],
   "source": [
    "column_names = ['Open', 'High', 'Low', 'Last', 'Close', 'Total Trade Quantity', 'Turnover (Lacs)']\n",
    "\n",
    "X_hitech = df_hitech.loc[:, column_names].values\n",
    "X_bhagyanagar = df_bhagyanar.loc[:, column_names].values\n",
    "X_hudco = df_hudco.loc[:, column_names].values\n",
    "\n",
    "X = np.concatenate([X_hitech, X_bhagyanagar, X_hudco], axis=0)\n",
    "\n",
    "print(X[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "some samples of y\n",
      "[1000 1000 1000 1000 1000]\n",
      "first x shape and then y shape\n",
      "shape of X: (159, 7)\n",
      "shape of y: (159,)\n",
      "[[  209.1    209.95   204.1    205.8    206.7   2935.       6.09]\n",
      " [  212.     214.95   208.3    208.3    209.15  5094.      10.78]]\n",
      "[1000 1000]\n"
     ]
    }
   ],
   "source": [
    "# print('we will define the y now')\n",
    "\n",
    "y = [1000] * 51 + [2000] * 55 + [3000] * 53\n",
    "y = np.asarray(y)\n",
    "\n",
    "print('some samples of y')\n",
    "print(y[:5])\n",
    "\n",
    "print('first x shape and then y shape')\n",
    "\n",
    "print('shape of X: {}'.format(X.shape))\n",
    "print('shape of y: {}'.format(y.shape))\n",
    "\n",
    "print(X[:2])\n",
    "print(y[:2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[  6.83000000e+01   6.83000000e+01   6.78000000e+01   6.79000000e+01\n",
      "   6.80000000e+01   1.43373000e+06   9.75450000e+02]\n",
      "[ 3000.  3000.  1000.  1000.  2000.]\n",
      "[    225.4      231.       214.       220.7      220.65  126911.       283.93]\n",
      "[ 1000.  1000.  1000.  1000.  3000.]\n"
     ]
    }
   ],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "def get_train_test(X, y, test_size):\n",
    "    \"\"\"Give the train and test from X and y.\n",
    "    \n",
    "    To do this first combine the matrices, then split them in ratio of test_size then strip out the X and y components.\n",
    "    \n",
    "    \"\"\"\n",
    "    y_reshaped = np.array([y.T])\n",
    "    combined_X_y = np.concatenate((X, y_reshaped.T), axis=1)\n",
    "    combined_Xy_train, combined_Xy_test = train_test_split(combined_X_y, test_size=test_size)\n",
    "\n",
    "    X_train, y_train_2d = np.split(combined_Xy_train, [-1], axis=1)\n",
    "    y_train = np.squeeze(y_train_2d)\n",
    "\n",
    "    X_test, y_test_2d = np.split(combined_Xy_test, [-1], axis=1)\n",
    "    y_test = np.squeeze(y_test_2d)\n",
    "    \n",
    "    return X_train, y_train, X_test, y_test\n",
    "    \n",
    "X_train, y_train, X_test, y_test = get_train_test(X, y, test_size=0.2)\n",
    "\n",
    "print(X_train[0])\n",
    "print(y_train[:5])\n",
    "\n",
    "print(X_test[0])\n",
    "print(y_test[:5])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 0.05469505  0.          0.14321643  0.19745815  0.23498935  0.15445189\n",
      "  0.21518912]\n",
      "normal X_test predictions are below:\n",
      "[ 1000.  1000.  1000.  1000.  3000.]\n"
     ]
    }
   ],
   "source": [
    "clf = RandomForestClassifier(max_depth=2, random_state=0)\n",
    "clf.fit(X_train, y_train)\n",
    "print(clf.feature_importances_)\n",
    "\n",
    "print('normal X_test predictions are below:')\n",
    "\n",
    "normal_predictions = clf.predict(X_test)\n",
    "\n",
    "print(normal_predictions[:5])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "NOw we will perform PCA. To do that we need to first scale the dataset\n",
      "(127, 7)\n",
      "mean is:\n",
      "1.55855830679e-16\n"
     ]
    }
   ],
   "source": [
    "print(\"NOw we will perform PCA. To do that we need to first scale the dataset\")\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "X_train_scaled = StandardScaler().fit_transform(X_train)\n",
    "X_test_scaled = StandardScaler().fit_transform(X_test)\n",
    "\n",
    "mu = X_train_scaled.mean()\n",
    "\n",
    "print(X_train_scaled.shape)\n",
    "print('mean is:')\n",
    "print(mu)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "explained variance in train\n",
      "[ 5.0029508   1.98866868]\n",
      "classified feature importances\n",
      "[ 0.37365783  0.62634217]\n"
     ]
    }
   ],
   "source": [
    "from sklearn.decomposition import PCA\n",
    "pca = PCA(n_components=2)\n",
    "\n",
    "training_features = pca.fit_transform(X_train_scaled)\n",
    "print('explained variance in train')\n",
    "print(pca.explained_variance_)\n",
    "\n",
    "clf.fit(training_features, y_train)\n",
    "print('classified feature importances')\n",
    "print(clf.feature_importances_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "trying out the prediction capabilities\n",
      "[[ 2.57645802  0.05547991]\n",
      " [ 2.48947141 -0.02405964]]\n",
      "explained variance in test\n",
      "[ 5.15421156  1.83697385]\n",
      "PCA predictions are below:\n",
      "[ 1000.  1000.  1000.  1000.  3000.]\n"
     ]
    }
   ],
   "source": [
    "print('trying out the prediction capabilities')\n",
    "test_features = pca.fit_transform(X_test_scaled)\n",
    "print(test_features[:2])\n",
    "\n",
    "print('explained variance in test')\n",
    "print(pca.explained_variance_)\n",
    "\n",
    "predictions = clf.predict(test_features)\n",
    "\n",
    "print('PCA predictions are below:')\n",
    "print(predictions[:5])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "##############################################\n",
      "normal X_test predictions are below:\n",
      "[ 1000.  1000.  1000.  1000.  3000.]\n",
      "......................................\n",
      "PCA predictions are below:\n",
      "[ 1000.  1000.  1000.  1000.  3000.]\n",
      "here are the real values\n",
      "[ 1000.  1000.  1000.  1000.  3000.]\n",
      "################################################\n",
      "what is the prediction score...\n",
      "normal predictive score: 1.0\n",
      "pca predictive score: 0.96875\n"
     ]
    }
   ],
   "source": [
    "print('##############################################')\n",
    "print('normal X_test predictions are below:')\n",
    "print(normal_predictions[:5])\n",
    "print('......................................')\n",
    "print('PCA predictions are below:')\n",
    "print(predictions[:5])\n",
    "print('here are the real values')\n",
    "print(y_test[:5])\n",
    "\n",
    "print('################################################')\n",
    "print('what is the prediction score...')\n",
    "normal_pscore = metrics.accuracy_score(y_test, normal_predictions)\n",
    "pca_score = metrics.accuracy_score(y_test, predictions)\n",
    "print(\"normal predictive score: {}\".format(normal_pscore))\n",
    "print(\"pca predictive score: {}\".format(pca_score))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style>\n",
	" .dataframe thead tr:only-child th {\n",
	" text-align: right;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: left;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>Date</th>\n",
	" <th>Open</th>\n",
	" <th>High</th>\n",
	" <th>Low</th>\n",
	" <th>Last</th>\n",
	" <th>Close</th>\n",
	" <th>Total Trade Quantity</th>\n",
	" <th>Turnover (Lacs)</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>2017-08-02</td>\n",
	" <td>209.10</td>\n",
	" <td>209.95</td>\n",
	" <td>204.1</td>\n",
	" <td>205.80</td>\n",
	" <td>206.70</td>\n",
	" <td>2935.0</td>\n",
	" <td>6.09</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>2017-08-01</td>\n",
	" <td>212.00</td>\n",
	" <td>214.95</td>\n",
	" <td>208.3</td>\n",
	" <td>208.30</td>\n",
	" <td>209.15</td>\n",
	" <td>5094.0</td>\n",
	" <td>10.78</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>2017-07-31</td>\n",
	" <td>212.00</td>\n",
	" <td>215.00</td>\n",
	" <td>210.2</td>\n",
	" <td>210.35</td>\n",
	" <td>212.00</td>\n",
	" <td>6803.0</td>\n",
	" <td>14.47</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>2017-07-28</td>\n",
	" <td>208.00</td>\n",
	" <td>213.95</td>\n",
	" <td>208.0</td>\n",
	" <td>211.95</td>\n",
	" <td>211.25</td>\n",
	" <td>2023.0</td>\n",
	" <td>4.28</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>2017-07-27</td>\n",
	" <td>213.05</td>\n",
	" <td>215.30</td>\n",
	" <td>209.0</td>\n",
	" <td>210.95</td>\n",
	" <td>210.50</td>\n",
	" <td>6714.0</td>\n",
	" <td>14.23</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" Date Open High Low Last Close Total Trade Quantity \\\n",
	"0 2017-08-02 209.10 209.95 204.1 205.80 206.70 2935.0 \n",
	"1 2017-08-01 212.00 214.95 208.3 208.30 209.15 5094.0 \n",
	"2 2017-07-31 212.00 215.00 210.2 210.35 212.00 6803.0 \n",
	"3 2017-07-28 208.00 213.95 208.0 211.95 211.25 2023.0 \n",
	"4 2017-07-27 213.05 215.30 209.0 210.95 210.50 6714.0 \n",
	"\n",
	" Turnover (Lacs) \n",
	"0 6.09 \n",
	"1 10.78 \n",
	"2 14.47 \n",
	"3 4.28 \n",
	"4 14.23 "
	]
	},
	"execution_count": 1,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"import pandas as pd\n",
	"from sklearn.ensemble import RandomForestClassifier\n",
	"import numpy as np\n",
	"from sklearn import metrics\n",
	"\n",
	"df_hitech = pd.read_csv(\"NSE-HITECHCORP.csv\")\n",
	"df_hitech.head()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"below are the shapes: \n",
	"(51, 8) (55, 8) (53, 8)\n"
	]
	}
	],
	"source": [
	"df_bhagyanar = pd.read_csv(\"NSE-BHAGYANGR.csv\")\n",
	"\n",
	"df_hudco = pd.read_csv(\"NSE-HUDCO.csv\")\n",
	"\n",
	"print('below are the shapes: ')\n",
	"print(df_hitech.shape, df_bhagyanar.shape, df_hudco.shape)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"[ 209.1 209.95 204.1 205.8 206.7 2935. 6.09]\n"
	]
	}
	],
	"source": [
	"column_names = ['Open', 'High', 'Low', 'Last', 'Close', 'Total Trade Quantity', 'Turnover (Lacs)']\n",
	"\n",
	"X_hitech = df_hitech.loc[:, column_names].values\n",
	"X_bhagyanagar = df_bhagyanar.loc[:, column_names].values\n",
	"X_hudco = df_hudco.loc[:, column_names].values\n",
	"\n",
	"X = np.concatenate([X_hitech, X_bhagyanagar, X_hudco], axis=0)\n",
	"\n",
	"print(X[0])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"some samples of y\n",
	"[1000 1000 1000 1000 1000]\n",
	"first x shape and then y shape\n",
	"shape of X: (159, 7)\n",
	"shape of y: (159,)\n",
	"[[ 209.1 209.95 204.1 205.8 206.7 2935. 6.09]\n",
	" [ 212. 214.95 208.3 208.3 209.15 5094. 10.78]]\n",
	"[1000 1000]\n"
	]
	}
	],
	"source": [
	"# print('we will define the y now')\n",
	"\n",
	"y = [1000] * 51 + [2000] * 55 + [3000] * 53\n",
	"y = np.asarray(y)\n",
	"\n",
	"print('some samples of y')\n",
	"print(y[:5])\n",
	"\n",
	"print('first x shape and then y shape')\n",
	"\n",
	"print('shape of X: {}'.format(X.shape))\n",
	"print('shape of y: {}'.format(y.shape))\n",
	"\n",
	"print(X[:2])\n",
	"print(y[:2])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"[ 6.83000000e+01 6.83000000e+01 6.78000000e+01 6.79000000e+01\n",
	" 6.80000000e+01 1.43373000e+06 9.75450000e+02]\n",
	"[ 3000. 3000. 1000. 1000. 2000.]\n",
	"[ 225.4 231. 214. 220.7 220.65 126911. 283.93]\n",
	"[ 1000. 1000. 1000. 1000. 3000.]\n"
	]
	}
	],
	"source": [
	"from sklearn.model_selection import train_test_split\n",
	"\n",
	"def get_train_test(X, y, test_size):\n",
	" \"\"\"Give the train and test from X and y.\n",
	" \n",
	" To do this first combine the matrices, then split them in ratio of test_size then strip out the X and y components.\n",
	" \n",
	" \"\"\"\n",
	" y_reshaped = np.array([y.T])\n",
	" combined_X_y = np.concatenate((X, y_reshaped.T), axis=1)\n",
	" combined_Xy_train, combined_Xy_test = train_test_split(combined_X_y, test_size=test_size)\n",
	"\n",
	" X_train, y_train_2d = np.split(combined_Xy_train, [-1], axis=1)\n",
	" y_train = np.squeeze(y_train_2d)\n",
	"\n",
	" X_test, y_test_2d = np.split(combined_Xy_test, [-1], axis=1)\n",
	" y_test = np.squeeze(y_test_2d)\n",
	" \n",
	" return X_train, y_train, X_test, y_test\n",
	" \n",
	"X_train, y_train, X_test, y_test = get_train_test(X, y, test_size=0.2)\n",
	"\n",
	"print(X_train[0])\n",
	"print(y_train[:5])\n",
	"\n",
	"print(X_test[0])\n",
	"print(y_test[:5])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"[ 0.05469505 0. 0.14321643 0.19745815 0.23498935 0.15445189\n",
	" 0.21518912]\n",
	"normal X_test predictions are below:\n",
	"[ 1000. 1000. 1000. 1000. 3000.]\n"
	]
	}
	],
	"source": [
	"clf = RandomForestClassifier(max_depth=2, random_state=0)\n",
	"clf.fit(X_train, y_train)\n",
	"print(clf.feature_importances_)\n",
	"\n",
	"print('normal X_test predictions are below:')\n",
	"\n",
	"normal_predictions = clf.predict(X_test)\n",
	"\n",
	"print(normal_predictions[:5])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 14,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"NOw we will perform PCA. To do that we need to first scale the dataset\n",
	"(127, 7)\n",
	"mean is:\n",
	"1.55855830679e-16\n"
	]
	}
	],
	"source": [
	"print(\"NOw we will perform PCA. To do that we need to first scale the dataset\")\n",
	"from sklearn.preprocessing import StandardScaler\n",
	"\n",
	"X_train_scaled = StandardScaler().fit_transform(X_train)\n",
	"X_test_scaled = StandardScaler().fit_transform(X_test)\n",
	"\n",
	"mu = X_train_scaled.mean()\n",
	"\n",
	"print(X_train_scaled.shape)\n",
	"print('mean is:')\n",
	"print(mu)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 15,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"explained variance in train\n",
	"[ 5.0029508 1.98866868]\n",
	"classified feature importances\n",
	"[ 0.37365783 0.62634217]\n"
	]
	}
	],
	"source": [
	"from sklearn.decomposition import PCA\n",
	"pca = PCA(n_components=2)\n",
	"\n",
	"training_features = pca.fit_transform(X_train_scaled)\n",
	"print('explained variance in train')\n",
	"print(pca.explained_variance_)\n",
	"\n",
	"clf.fit(training_features, y_train)\n",
	"print('classified feature importances')\n",
	"print(clf.feature_importances_)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 16,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"trying out the prediction capabilities\n",
	"[[ 2.57645802 0.05547991]\n",
	" [ 2.48947141 -0.02405964]]\n",
	"explained variance in test\n",
	"[ 5.15421156 1.83697385]\n",
	"PCA predictions are below:\n",
	"[ 1000. 1000. 1000. 1000. 3000.]\n"
	]
	}
	],
	"source": [
	"print('trying out the prediction capabilities')\n",
	"test_features = pca.fit_transform(X_test_scaled)\n",
	"print(test_features[:2])\n",
	"\n",
	"print('explained variance in test')\n",
	"print(pca.explained_variance_)\n",
	"\n",
	"predictions = clf.predict(test_features)\n",
	"\n",
	"print('PCA predictions are below:')\n",
	"print(predictions[:5])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 17,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"##############################################\n",
	"normal X_test predictions are below:\n",
	"[ 1000. 1000. 1000. 1000. 3000.]\n",
	"......................................\n",
	"PCA predictions are below:\n",
	"[ 1000. 1000. 1000. 1000. 3000.]\n",
	"here are the real values\n",
	"[ 1000. 1000. 1000. 1000. 3000.]\n",
	"################################################\n",
	"what is the prediction score...\n",
	"normal predictive score: 1.0\n",
	"pca predictive score: 0.96875\n"
	]
	}
	],
	"source": [
	"print('##############################################')\n",
	"print('normal X_test predictions are below:')\n",
	"print(normal_predictions[:5])\n",
	"print('......................................')\n",
	"print('PCA predictions are below:')\n",
	"print(predictions[:5])\n",
	"print('here are the real values')\n",
	"print(y_test[:5])\n",
	"\n",
	"print('################################################')\n",
	"print('what is the prediction score...')\n",
	"normal_pscore = metrics.accuracy_score(y_test, normal_predictions)\n",
	"pca_score = metrics.accuracy_score(y_test, predictions)\n",
	"print(\"normal predictive score: {}\".format(normal_pscore))\n",
	"print(\"pca predictive score: {}\".format(pca_score))"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.1"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}