Andrew9255/SageMaker_demo.ipynb

## SageMaker_demo.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "isConfigCell": true,
    "nbpresent": {
     "id": "6427e831-8f89-45c0-b150-0b134397d79a"
    },
    "tags": [
     "parameters"
    ]
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import boto3\n",
    "import re\n",
    "from sagemaker import get_execution_role\n",
    "\n",
    "role = get_execution_role()\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "bucket='5003msbdandrengaidataset'\n",
    "data_key = 'awstest.csv'\n",
    "data_location = 's3://{}/{}'.format(bucket, data_key)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "nbpresent": {
     "id": "bb88eea9-27f3-4e47-9133-663911ea09a9"
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import io\n",
    "import time\n",
    "import json\n",
    "import sagemaker.amazon.common as smac"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "nbpresent": {
     "id": "f8976dad-6897-4c7e-8c95-ae2f53070ef5"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(1000, 7)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>month</th>\n",
       "      <th>day</th>\n",
       "      <th>weekday</th>\n",
       "      <th>DISTANCE</th>\n",
       "      <th>f0_</th>\n",
       "      <th>dealyTF</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>10</td>\n",
       "      <td>7</td>\n",
       "      <td>1246</td>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>24</td>\n",
       "      <td>7</td>\n",
       "      <td>337</td>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>20</td>\n",
       "      <td>3</td>\n",
       "      <td>1476</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>19</td>\n",
       "      <td>6</td>\n",
       "      <td>1476</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>3</td>\n",
       "      <td>12</td>\n",
       "      <td>2</td>\n",
       "      <td>679</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Unnamed: 0  month  day  weekday  DISTANCE  f0_  dealyTF\n",
       "0           0      3   10        7      1246    6        1\n",
       "1           1      3   24        7       337    6        1\n",
       "2           2      3   20        3      1476    6        0\n",
       "3           3      1   19        6      1476    6        0\n",
       "4           4      3   12        2       679    6        0"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>month</th>\n",
       "      <th>day</th>\n",
       "      <th>weekday</th>\n",
       "      <th>DISTANCE</th>\n",
       "      <th>f0_</th>\n",
       "      <th>dealyTF</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>1000.000000</td>\n",
       "      <td>1000.000000</td>\n",
       "      <td>1000.000000</td>\n",
       "      <td>1000.000000</td>\n",
       "      <td>1000.000000</td>\n",
       "      <td>1000.000000</td>\n",
       "      <td>1000.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>499.500000</td>\n",
       "      <td>2.132000</td>\n",
       "      <td>15.947000</td>\n",
       "      <td>4.043000</td>\n",
       "      <td>1701.183000</td>\n",
       "      <td>14.259000</td>\n",
       "      <td>0.568000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>288.819436</td>\n",
       "      <td>0.846932</td>\n",
       "      <td>8.580432</td>\n",
       "      <td>1.864035</td>\n",
       "      <td>904.867268</td>\n",
       "      <td>4.854017</td>\n",
       "      <td>0.495602</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>236.000000</td>\n",
       "      <td>6.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>249.750000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>9.000000</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>679.000000</td>\n",
       "      <td>10.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>499.500000</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>16.000000</td>\n",
       "      <td>4.000000</td>\n",
       "      <td>2288.000000</td>\n",
       "      <td>15.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>749.250000</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>22.000000</td>\n",
       "      <td>6.000000</td>\n",
       "      <td>2475.000000</td>\n",
       "      <td>18.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>999.000000</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>31.000000</td>\n",
       "      <td>7.000000</td>\n",
       "      <td>2704.000000</td>\n",
       "      <td>23.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        Unnamed: 0        month          day      weekday     DISTANCE  \\\n",
       "count  1000.000000  1000.000000  1000.000000  1000.000000  1000.000000   \n",
       "mean    499.500000     2.132000    15.947000     4.043000  1701.183000   \n",
       "std     288.819436     0.846932     8.580432     1.864035   904.867268   \n",
       "min       0.000000     1.000000     1.000000     1.000000   236.000000   \n",
       "25%     249.750000     1.000000     9.000000     3.000000   679.000000   \n",
       "50%     499.500000     2.000000    16.000000     4.000000  2288.000000   \n",
       "75%     749.250000     3.000000    22.000000     6.000000  2475.000000   \n",
       "max     999.000000     3.000000    31.000000     7.000000  2704.000000   \n",
       "\n",
       "               f0_      dealyTF  \n",
       "count  1000.000000  1000.000000  \n",
       "mean     14.259000     0.568000  \n",
       "std       4.854017     0.495602  \n",
       "min       6.000000     0.000000  \n",
       "25%      10.000000     0.000000  \n",
       "50%      15.000000     1.000000  \n",
       "75%      18.000000     1.000000  \n",
       "max      23.000000     1.000000  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "data = pd.read_csv(data_location)\n",
    "\n",
    "\n",
    "data.to_csv(\"data.csv\", sep=',', index=False)\n",
    "\n",
    "# print the shape of the data file\n",
    "print(data.shape)\n",
    "\n",
    "# show the top few rows\n",
    "display(data.head())\n",
    "\n",
    "# describe the data object\n",
    "display(data.describe())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/ipykernel/__main__.py:10: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead.\n",
      "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/ipykernel/__main__.py:11: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead.\n",
      "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/ipykernel/__main__.py:18: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead.\n",
      "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/ipykernel/__main__.py:19: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead.\n",
      "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/ipykernel/__main__.py:21: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead.\n",
      "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/ipykernel/__main__.py:22: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead.\n"
     ]
    }
   ],
   "source": [
    "rand_split = np.random.rand(len(data))\n",
    "train_list = rand_split < 0.8\n",
    "val_list = (rand_split >= 0.8) & (rand_split < 0.9)\n",
    "test_list = rand_split >= 0.9\n",
    "\n",
    "data_train = data[train_list]\n",
    "data_val = data[val_list]\n",
    "data_test = data[test_list]\n",
    "\n",
    "train_y = ((data_train.iloc[:,6])).as_matrix();\n",
    "train_X = (data_train.iloc[:,1:6]).as_matrix();\n",
    "\n",
    "\n",
    "\n",
    "# train_y = ((data_train.iloc[:,1] == 'M') +0).as_matrix();\n",
    "# train_X = data_train.iloc[:,2:].as_matrix();\n",
    "\n",
    "val_y = ((data_val.iloc[:,6]).as_matrix())\n",
    "val_X = data_val.iloc[:,1:6].as_matrix()\n",
    "\n",
    "test_y = ((data_test.iloc[:,6]).as_matrix())\n",
    "test_X = data_test.iloc[:,1:6].as_matrix()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "nbpresent": {
     "id": "cd8e3431-79d9-40b6-91d1-d67cd61894e7"
    }
   },
   "outputs": [],
   "source": [
    "train_file = 'linear_train.data'\n",
    "\n",
    "f = io.BytesIO()\n",
    "smac.write_numpy_to_dense_tensor(f, train_X.astype('float32'), train_y.astype('float32'))\n",
    "f.seek(0)\n",
    "\n",
    "boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', train_file)).upload_fileobj(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "nbpresent": {
     "id": "bd113b8e-adc1-4091-a26f-a426149fe604"
    }
   },
   "outputs": [],
   "source": [
    "validation_file = 'linear_validation.data'\n",
    "\n",
    "f = io.BytesIO()\n",
    "smac.write_numpy_to_dense_tensor(f, val_X.astype('float32'), val_y.astype('float32'))\n",
    "f.seek(0)\n",
    "\n",
    "boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'validation', validation_file)).upload_fileobj(f)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "f3b125ad-a2d5-464c-8cfa-bd203034eee4"
    }
   },
   "source": [
    "---\n",
    "## Train\n",
    "\n",
    "Now we can begin to specify our linear model.  Amazon SageMaker's Linear Learner actually fits many models in parallel, each with slightly different hyperparameters, and then returns the one with the best fit.  This functionality is automatically enabled.  We can influence this using parameters like:\n",
    "\n",
    "- `num_models` to increase to total number of models run.  The specified parameters will always be one of those models, but the algorithm also chooses models with nearby parameter values in order to find a solution nearby that may be more optimal.  In this case, we're going to use the max of 32.\n",
    "- `loss` which controls how we penalize mistakes in our model estimates.  For this case, let's use absolute loss as we haven't spent much time cleaning the data, and absolute loss will be less sensitive to outliers.\n",
    "- `wd` or `l1` which control regularization.  Regularization can prevent model overfitting by preventing our estimates from becoming too finely tuned to the training data, which can actually hurt generalizability.  In this case, we'll leave these parameters as their default \"auto\" though."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Specify container images used for training and hosting SageMaker's linear-learner"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# See 'Algorithms Provided by Amazon SageMaker: Common Parameters' in the SageMaker documentation for an explanation of these values.\n",
    "from sagemaker.amazon.amazon_estimator import get_image_uri\n",
    "container = get_image_uri(boto3.Session().region_name, 'linear-learner')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "nbpresent": {
     "id": "397fb60a-c48b-453f-88ea-4d832b70c919"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Job name is: DEMO-linear-2019-11-05-12-06-56\n"
     ]
    }
   ],
   "source": [
    "linear_job = 'DEMO-linear-' + time.strftime(\"%Y-%m-%d-%H-%M-%S\", time.gmtime())\n",
    "\n",
    "\n",
    "\n",
    "print(\"Job name is:\", linear_job)\n",
    "\n",
    "linear_training_params = {\n",
    "    \"RoleArn\": role,\n",
    "    \"TrainingJobName\": linear_job,\n",
    "    \"AlgorithmSpecification\": {\n",
    "        \"TrainingImage\": container,\n",
    "        \"TrainingInputMode\": \"File\"\n",
    "    },\n",
    "    \"ResourceConfig\": {\n",
    "        \"InstanceCount\": 1,\n",
    "        \"InstanceType\": \"ml.c4.2xlarge\",\n",
    "        \"VolumeSizeInGB\": 10\n",
    "    },\n",
    "    \"InputDataConfig\": [\n",
    "        {\n",
    "            \"ChannelName\": \"train\",\n",
    "            \"DataSource\": {\n",
    "                \"S3DataSource\": {\n",
    "                    \"S3DataType\": \"S3Prefix\",\n",
    "                    \"S3Uri\": \"s3://{}/{}/train/\".format(bucket, prefix),\n",
    "                    \"S3DataDistributionType\": \"ShardedByS3Key\"\n",
    "                }\n",
    "            },\n",
    "            \"CompressionType\": \"None\",\n",
    "            \"RecordWrapperType\": \"None\"\n",
    "        },\n",
    "        {\n",
    "            \"ChannelName\": \"validation\",\n",
    "            \"DataSource\": {\n",
    "                \"S3DataSource\": {\n",
    "                    \"S3DataType\": \"S3Prefix\",\n",
    "                    \"S3Uri\": \"s3://{}/{}/validation/\".format(bucket, prefix),\n",
    "                    \"S3DataDistributionType\": \"FullyReplicated\"\n",
    "                }\n",
    "            },\n",
    "            \"CompressionType\": \"None\",\n",
    "            \"RecordWrapperType\": \"None\"\n",
    "        }\n",
    "\n",
    "    ],\n",
    "    \"OutputDataConfig\": {\n",
    "        \"S3OutputPath\": \"s3://{}/{}/\".format(bucket, prefix)\n",
    "    },\n",
    "    \"HyperParameters\": {\n",
    "        \"feature_dim\": \"5\",\n",
    "        \"mini_batch_size\": \"100\",\n",
    "        \"predictor_type\": \"regressor\",\n",
    "        \"epochs\": \"10\",\n",
    "        \"num_models\": \"32\",\n",
    "        \"loss\": \"absolute_loss\"\n",
    "    },\n",
    "    \"StoppingCondition\": {\n",
    "        \"MaxRuntimeInSeconds\": 60 * 60\n",
    "    }\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now let's kick off our training job in SageMaker's distributed, managed training, using the parameters we just created.  Because training is managed, we don't have to wait for our job to finish to continue, but for this case, let's use boto3's 'training_job_completed_or_stopped' waiter so we can ensure that the job has been started."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "InProgress\n",
      "CPU times: user 68.1 ms, sys: 4.34 ms, total: 72.4 ms\n",
      "Wall time: 4min\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "region = boto3.Session().region_name\n",
    "sm = boto3.client('sagemaker')\n",
    "\n",
    "sm.create_training_job(**linear_training_params)\n",
    "\n",
    "status = sm.describe_training_job(TrainingJobName=linear_job)['TrainingJobStatus']\n",
    "print(status)\n",
    "sm.get_waiter('training_job_completed_or_stopped').wait(TrainingJobName=linear_job)\n",
    "if status == 'Failed':\n",
    "    message = sm.describe_training_job(TrainingJobName=linear_job)['FailureReason']\n",
    "    print('Training failed with the following error: {}'.format(message))\n",
    "    raise Exception('Training job failed')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "2adcc348-9ab5-4a8a-8139-d0ecd740208a"
    }
   },
   "source": [
    "---\n",
    "## Host\n",
    "\n",
    "Now that we've trained the linear algorithm on our data, let's setup a model which can later be hosted.  We will:\n",
    "1. Point to the scoring container\n",
    "1. Point to the model.tar.gz that came from training\n",
    "1. Create the hosting model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "nbpresent": {
     "id": "c88fb868-01d2-4991-8953-28814c022bdc"
    }
   },
   "outputs": [
    {
     "ename": "ClientError",
     "evalue": "An error occurred (ValidationException) when calling the CreateModel operation: Cannot create already existing model \"arn:aws:sagemaker:us-east-2:230914795427:model/demo-linear-2019-11-05-12-06-56\".",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mClientError\u001b[0m                               Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-22-0f1fff39f00f>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      7\u001b[0m     \u001b[0mModelName\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlinear_job\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      8\u001b[0m     \u001b[0mExecutionRoleArn\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrole\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m     PrimaryContainer=linear_hosting_container)\n\u001b[0m\u001b[1;32m     10\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     11\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcreate_model_response\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'ModelArn'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/site-packages/botocore/client.py\u001b[0m in \u001b[0;36m_api_call\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m    355\u001b[0m                     \"%s() only accepts keyword arguments.\" % py_operation_name)\n\u001b[1;32m    356\u001b[0m             \u001b[0;31m# The \"self\" in this scope is referring to the BaseClient.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 357\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_make_api_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moperation_name\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    358\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    359\u001b[0m         \u001b[0m_api_call\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpy_operation_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/site-packages/botocore/client.py\u001b[0m in \u001b[0;36m_make_api_call\u001b[0;34m(self, operation_name, api_params)\u001b[0m\n\u001b[1;32m    659\u001b[0m             \u001b[0merror_code\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mparsed_response\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Error\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Code\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    660\u001b[0m             \u001b[0merror_class\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexceptions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfrom_code\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merror_code\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 661\u001b[0;31m             \u001b[0;32mraise\u001b[0m \u001b[0merror_class\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mparsed_response\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moperation_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    662\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    663\u001b[0m             \u001b[0;32mreturn\u001b[0m \u001b[0mparsed_response\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mClientError\u001b[0m: An error occurred (ValidationException) when calling the CreateModel operation: Cannot create already existing model \"arn:aws:sagemaker:us-east-2:230914795427:model/demo-linear-2019-11-05-12-06-56\"."
     ]
    }
   ],
   "source": [
    "linear_hosting_container = {\n",
    "    'Image': container,\n",
    "    'ModelDataUrl': sm.describe_training_job(TrainingJobName=linear_job)['ModelArtifacts']['S3ModelArtifacts']\n",
    "}\n",
    "\n",
    "create_model_response = sm.create_model(\n",
    "    ModelName=linear_job,\n",
    "    ExecutionRoleArn=role,\n",
    "    PrimaryContainer=linear_hosting_container)\n",
    "\n",
    "print(create_model_response['ModelArn'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Once we've setup a model, we can configure what our hosting endpoints should be.  Here we specify:\n",
    "1. EC2 instance type to use for hosting\n",
    "1. Initial number of instances\n",
    "1. Our hosting model name"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DEMO-linear-endpoint-config-2019-11-05-12-14-27\n",
      "Endpoint Config Arn: arn:aws:sagemaker:us-east-2:230914795427:endpoint-config/demo-linear-endpoint-config-2019-11-05-12-14-27\n"
     ]
    }
   ],
   "source": [
    "linear_endpoint_config = 'DEMO-linear-endpoint-config-' + time.strftime(\"%Y-%m-%d-%H-%M-%S\", time.gmtime())\n",
    "print(linear_endpoint_config)\n",
    "create_endpoint_config_response = sm.create_endpoint_config(\n",
    "    EndpointConfigName=linear_endpoint_config,\n",
    "    ProductionVariants=[{\n",
    "        'InstanceType': 'ml.t2.medium',\n",
    "        'InitialInstanceCount': 1,\n",
    "        'ModelName': linear_job,\n",
    "        'VariantName': 'AllTraffic'}])\n",
    "\n",
    "print(\"Endpoint Config Arn: \" + create_endpoint_config_response['EndpointConfigArn'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now that we've specified how our endpoint should be configured, we can create them.  This can be done in the background, but for now let's run a loop that updates us on the status of the endpoints so that we know when they are ready for use."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DEMO-linear-endpoint-201911051214\n",
      "arn:aws:sagemaker:us-east-2:230914795427:endpoint/demo-linear-endpoint-201911051214\n",
      "Status: Creating\n",
      "Arn: arn:aws:sagemaker:us-east-2:230914795427:endpoint/demo-linear-endpoint-201911051214\n",
      "Status: InService\n",
      "CPU times: user 265 ms, sys: 26.5 ms, total: 291 ms\n",
      "Wall time: 10min 32s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "linear_endpoint = 'DEMO-linear-endpoint-' + time.strftime(\"%Y%m%d%H%M\", time.gmtime())\n",
    "print(linear_endpoint)\n",
    "create_endpoint_response = sm.create_endpoint(\n",
    "    EndpointName=linear_endpoint,\n",
    "    EndpointConfigName=linear_endpoint_config)\n",
    "print(create_endpoint_response['EndpointArn'])\n",
    "\n",
    "resp = sm.describe_endpoint(EndpointName=linear_endpoint)\n",
    "status = resp['EndpointStatus']\n",
    "print(\"Status: \" + status)\n",
    "\n",
    "sm.get_waiter('endpoint_in_service').wait(EndpointName=linear_endpoint)\n",
    "\n",
    "resp = sm.describe_endpoint(EndpointName=linear_endpoint)\n",
    "status = resp['EndpointStatus']\n",
    "print(\"Arn: \" + resp['EndpointArn'])\n",
    "print(\"Status: \" + status)\n",
    "\n",
    "if status != 'InService':\n",
    "    raise Exception('Endpoint creation did not succeed')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Predict\n",
    "### Predict on Test Data\n",
    "\n",
    "Now that we have our hosted endpoint, we can generate statistical predictions from it.  Let's predict on our test dataset to understand how accurate our model is.\n",
    "\n",
    "There are many metrics to measure classification accuracy.  Common examples include include:\n",
    "- Precision\n",
    "- Recall\n",
    "- F1 measure\n",
    "- Area under the ROC curve - AUC\n",
    "- Total Classification Accuracy \n",
    "- Mean Absolute Error\n",
    "\n",
    "For our example, we'll keep things simple and use total classification accuracy as our metric of choice. We will also evaluate  Mean Absolute  Error (MAE) as the linear-learner has been optimized using this metric, not necessarily because it is a relevant metric from an application point of view. We'll compare the performance of the linear-learner against a naive benchmark prediction which uses majority class observed in the training data set for prediction on the test data.\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Function to convert an array to a csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "def np2csv(arr):\n",
    "    csv = io.BytesIO()\n",
    "    np.savetxt(csv, arr, delimiter=',', fmt='%g')\n",
    "    return csv.getvalue().decode().rstrip()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Next, we'll invoke the endpoint to get predictions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "runtime= boto3.client('runtime.sagemaker')\n",
    "\n",
    "payload = np2csv(test_X)\n",
    "response = runtime.invoke_endpoint(EndpointName=linear_endpoint,\n",
    "                                   ContentType='text/csv',\n",
    "                                   Body=payload)\n",
    "result = json.loads(response['Body'].read().decode())\n",
    "test_pred = np.array([r['score'] for r in result['predictions']])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's compare linear learner based mean absolute prediction errors from a baseline prediction which uses majority class to predict every instance."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Test MAE Baseline : 0.411\n",
      "Test MAE Linear: 0.4\n"
     ]
    }
   ],
   "source": [
    "test_mae_linear = np.mean(np.abs(test_y - test_pred))\n",
    "test_mae_baseline = np.mean(np.abs(test_y - np.median(train_y))) ## training median as baseline predictor\n",
    "\n",
    "print(\"Test MAE Baseline :\", round(test_mae_baseline, 3))\n",
    "print(\"Test MAE Linear:\", round(test_mae_linear,3))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's compare predictive accuracy using a classification threshold of 0.5 for the predicted and compare against the majority class prediction from training data set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Prediction Accuracy: 62.2 %\n",
      "Baseline Accuracy: 58.9 %\n"
     ]
    }
   ],
   "source": [
    "test_pred_class = (test_pred > 0.5)+0;\n",
    "test_pred_baseline = np.repeat(np.median(train_y), len(test_y))\n",
    "\n",
    "prediction_accuracy = np.mean((test_y == test_pred_class))*100\n",
    "baseline_accuracy = np.mean((test_y == test_pred_baseline))*100\n",
    "\n",
    "print(\"Prediction Accuracy:\", round(prediction_accuracy,1), \"%\")\n",
    "print(\"Baseline Accuracy:\", round(baseline_accuracy,1), \"%\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###### Run the cell below to delete endpoint once you are done."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'ResponseMetadata': {'RequestId': '1cb5bb74-55c7-4de7-bc59-5344c998a4b2',\n",
       "  'HTTPStatusCode': 200,\n",
       "  'HTTPHeaders': {'x-amzn-requestid': '1cb5bb74-55c7-4de7-bc59-5344c998a4b2',\n",
       "   'content-type': 'application/x-amz-json-1.1',\n",
       "   'content-length': '0',\n",
       "   'date': 'Tue, 05 Nov 2019 13:21:27 GMT'},\n",
       "  'RetryAttempts': 0}}"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sm.delete_endpoint(EndpointName=linear_endpoint)"
   ]
  }
 ],
 "metadata": {
  "celltoolbar": "Tags",
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  },
  "notice": "Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.  Licensed under the Apache License, Version 2.0 (the License). You may not use this file except in compliance with the License. A copy of the License is located at http://aws.amazon.com/apache2.0/ or in the license file accompanying this file. This file is distributed on an AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License."
 },
 "nbformat": 4,
 "nbformat_minor": 2
}