A simplified example of Active Learning using clustering and decision trees on the diabetes dataset. This IPython notebook was used for a demonstration during the BigML Spring 2014 Webinar, which can be viewed here: http://youtu.be/uG-vXFyCcms
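The core trick the notebook demonstrates — picking, for each cluster, the instance nearest its centroid as the representative sample — can be sketched outside BigML with plain NumPy. This is a minimal illustrative sketch, not the notebook's code: the function name and the toy data are invented, and cluster assignments are assumed to be already computed (the notebook delegates the clustering itself to BigML's server).

```python
import numpy as np

def nearest_to_centroids(X, labels, centroids):
    """Return, per cluster id, the index of the point closest to its centroid."""
    reps = {}
    for k, c in enumerate(centroids):
        members = np.where(labels == k)[0]           # indices of this cluster's points
        dists = np.linalg.norm(X[members] - c, axis=1)
        reps[k] = int(members[np.argmin(dists)])     # closest member represents the cluster
    return reps

# Toy data: five points with pre-assigned cluster labels
X = np.array([[0.0, 0.0], [1.0, 0.0], [0.4, 0.0], [5.0, 5.0], [6.0, 5.0]])
labels = np.array([0, 0, 0, 1, 1])
centroids = np.array([X[labels == k].mean(axis=0) for k in (0, 1)])
print(nearest_to_centroids(X, labels, centroids))  # {0: 2, 1: 3}
```

The selected indices would then play the role of the notebook's "intelligently chosen samples": one labeled instance per cluster, instead of a blind random draw.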
{
 "metadata": {
  "name": ""
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "heading",
     "level": 4,
     "metadata": {},
     "source": [
      "This IPython Notebook uses the BigML API and the diabetes dataset to provide a simplified demonstration of Active Learning."
     ]
    },
    {
     "cell_type": "heading",
     "level": 4,
     "metadata": {},
     "source": [
      "The goal is to create the best possible model for predicting diabetes using only 5 instances out of the 768 in the dataset. The first method simply takes a random sample of 5 instances. The second method uses clustering to group the instances into 5 clusters by similarities in the diagnostic measurements, and then uses the data point closest to each centroid as the sample points for modeling. In both cases, the resulting model is evaluated against the entire dataset."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import bigml, csv, StringIO\n",
      "from bigml.api import BigML\n",
      "\n",
      "# You need to define BIGML_USERNAME and BIGML_API_KEY in your environment settings, or pass them here as arguments:\n",
      "# api = BigML(username, api_key)\n",
      "\n",
      "api = BigML()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Create the diabetes source by fetching the data from S3. Note that we wrap all these API calls with\n",
      "# api.check_resource so that each step runs synchronously\n",
      "\n",
      "diabetes_source = api.check_resource(\n",
      "    api.create_source(\"s3://bigml-public/arff/diabetes.arff\", {\"name\": \"Diabetes\"}),\n",
      "    api.get_source)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Create the dataset, but remove the \"weight\" field, which has id 000009\n",
      "\n",
      "diabetes_dataset = api.check_resource(\n",
      "    api.create_dataset(diabetes_source, {\"excluded_fields\": [\"000009\"]}),\n",
      "    api.get_dataset)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 3
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Create the random sample by sampling over the entire dataset but restricting the range to instances 1 through 5\n",
      "\n",
      "diabetes_random_sample = api.check_resource(\n",
      "    api.create_dataset(diabetes_dataset, {\"sample_rate\": 1, \"range\": [1, 5], \"name\": \"Diabetes Random Sample\"}),\n",
      "    api.get_dataset)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 4
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Now we create the model using the random sample\n",
      "\n",
      "diabetes_random_model = api.check_resource(\n",
      "    api.create_model(diabetes_random_sample),\n",
      "    api.get_model)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 5
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# And finally, evaluate the performance of the random sample model against the entire dataset\n",
      "\n",
      "diabetes_random_eval = api.check_resource(\n",
      "    api.create_evaluation(diabetes_random_model, diabetes_dataset),\n",
      "    api.get_evaluation)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 6
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Here the Active Learning demonstration begins. The first step is to cluster the instances in the diabetes dataset.\n",
      "# However, we have to ignore the class field (which we want to predict) by scaling it to 0\n",
      "\n",
      "diabetes_cluster = api.check_resource(\n",
      "    api.create_cluster(diabetes_dataset, {\"k\": 5, \"field_scales\": {\"class\": 0}}),\n",
      "    api.get_cluster)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 7
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# We need to assign a centroid and distance to each datapoint in the dataset using the cluster. This generates\n",
      "# a CSV which we will fetch next. Note that we are keeping all fields in the CSV output, as well as the header and the\n",
      "# distance score. This will allow us to sample from this CSV to create a new source/dataset/model\n",
      "\n",
      "diabetes_batchcentroid = api.check_resource(\n",
      "    api.create_batch_centroid(diabetes_cluster, diabetes_dataset, {\"all_fields\": True, \"header\": True, \"distance\": True, \"distance_name\": \"distance\"}),\n",
      "    api.get_batch_centroid)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 8
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Fetch the centroid scores and use the csv module to parse them into a list of dictionaries. This\n",
      "# dataset is small, so we are going to do all the CSV steps in memory rather than write to disk\n",
      "# and worry about handling files\n",
      "\n",
      "centroid_scores = csv.DictReader(api.download_batch_centroid(diabetes_batchcentroid))\n",
      "centroid_samples = {}"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 9
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Now we examine each row in the CSV and build up a dictionary mapping the cluster name (e.g. \"Cluster 0\") to a\n",
      "# sampled row. The idea is to find the row (i.e. the instance) that is closest to the centroid of each cluster.\n",
      "# All we need to do is compare the values in the \"distance\" field and keep the smallest. Note that\n",
      "# csv.DictReader yields every field as a string, so we convert to float to compare numerically.\n",
      "\n",
      "for row in centroid_scores:\n",
      "    if row['cluster'] not in centroid_samples:\n",
      "        centroid_samples[row['cluster']] = row\n",
      "    elif float(row[\"distance\"]) < float(centroid_samples[row['cluster']][\"distance\"]):\n",
      "        centroid_samples[row['cluster']] = row"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 10
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Rather than write to disk, we create an in-memory CSV using StringIO.\n",
      "# We write a header to the CSV using the field names from the centroid_scores CSV we downloaded previously\n",
      "\n",
      "CSV = StringIO.StringIO()\n",
      "buffer = csv.DictWriter(CSV, centroid_scores.fieldnames)\n",
      "buffer.writeheader()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 11
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# For each of the minimum-distance rows in centroid_samples, we write the row to the in-memory CSV.\n",
      "# These will be our intelligently chosen samples.\n",
      "\n",
      "for cluster in centroid_samples:\n",
      "    buffer.writerow(centroid_samples[cluster])"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 12
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Now we repeat the source/dataset/model/eval steps for the cluster samples.\n",
      "# First step: create the source from the in-memory CSV\n",
      "\n",
      "diabetes_cluster_source = api.check_resource(\n",
      "    api.create_source(CSV, {\"name\": \"Diabetes Cluster Sample\"}),\n",
      "    api.get_source)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 13
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Now create the dataset. We need to drop the cluster assignment and distance fields (ids 000009 and 00000a).\n",
      "\n",
      "diabetes_cluster_sample = api.check_resource(\n",
      "    api.create_dataset(diabetes_cluster_source, {\"excluded_fields\": [\"000009\", \"00000a\"], \"name\": \"Diabetes Cluster Sample\"}),\n",
      "    api.get_dataset)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 14
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Model the cluster sample dataset\n",
      "\n",
      "diabetes_cluster_model = api.check_resource(\n",
      "    api.create_model(diabetes_cluster_sample),\n",
      "    api.get_model)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 15
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Last step: evaluate this model against the original dataset\n",
      "\n",
      "diabetes_cluster_eval = api.check_resource(\n",
      "    api.create_evaluation(diabetes_cluster_model, diabetes_dataset),\n",
      "    api.get_evaluation)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 16
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Simple comparison of the accuracies. It's worth comparing these evaluations side-by-side in the bigml.com UI;\n",
      "# all 5 metrics should be significantly better for the cluster sample.\n",
      "\n",
      "print \"Random Sample Accuracy: %s\" % diabetes_random_eval['object']['result']['model']['accuracy']\n",
      "print \"Cluster Sample Accuracy: %s\" % diabetes_cluster_eval['object']['result']['model']['accuracy']"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "Random Sample Accuracy: 0.65495\n",
        "Cluster Sample Accuracy: 0.73568\n"
       ]
      }
     ],
     "prompt_number": 17
    }
   ],
   "metadata": {}
  }
 ]
}
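One detail worth flagging in the notebook's min-distance loop: `csv.DictReader` yields every field as a string, so the `distance` values must be converted to `float` before comparing, otherwise lexicographic string comparison silently picks the wrong row (e.g. `"9.0" < "10.0"` is False as strings). A standalone sketch of that selection step, with invented toy rows, written for Python 3 (where `StringIO` lives in the `io` module, unlike the Python 2 notebook above):

```python
import csv
import io

# Toy stand-in for the batch-centroid CSV: cluster assignment plus distance per row
toy_csv = io.StringIO(
    "cluster,distance,plasma glucose\n"
    "Cluster 0,10.0,148\n"
    "Cluster 0,9.0,85\n"
    "Cluster 1,0.5,183\n"
)

samples = {}
for row in csv.DictReader(toy_csv):
    d = float(row["distance"])  # DictReader yields strings; compare numerically
    best = samples.get(row["cluster"])
    if best is None or d < float(best["distance"]):
        samples[row["cluster"]] = row  # keep the row nearest its centroid

print(sorted((k, v["plasma glucose"]) for k, v in samples.items()))
# [('Cluster 0', '85'), ('Cluster 1', '183')]
```

With string comparison instead of `float`, the first Cluster 0 row (distance `10.0`) would have been kept, since `"9.0" < "10.0"` is False lexicographically.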