Skip to content

Instantly share code, notes, and snippets.

@petersen-poul
Last active August 29, 2015 14:02
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save petersen-poul/f3d7bce160241f293501 to your computer and use it in GitHub Desktop.
A simplified example of Active Learning using clustering and decision trees for the diabetes dataset. This iPython notebook was used for a demonstration during the BigML Spring 2014 Webinar which can be viewed here: http://youtu.be/uG-vXFyCcms
{
"metadata": {
"name": ""
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"This iPython Notebook uses the BigML API and the diabetes dataset to provide a simplified demonstration of Active Learning."
]
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"The goal is to create the best possible model for predicting diabetes using only 5 instances out of the 768 in the dataset. The first method is to try just a random sample of 5 instances. The second method is to use clustering to group the instances by similarities in the diagnostic measurements into 5 clusters, and then use the data point closest to each centroid as the sample points for modeling. For both cases, an evaluation against the entire dataset is performed as well."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import bigml, csv, StringIO\n",
"from bigml.api import BigML\n",
"\n",
"# You need to define BIGML_USERNAME and BIGML_API_KEY in your environment settings, or add them here as options\n",
"# api = BigML(username, api_key)\n",
"\n",
"api = BigML()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Create the diabetes source file by fetching the data from s3. Note that we wrap all these API calls with \n",
"# api.check_resource so that each step runs synchronously\n",
"\n",
"diabetes_source = api.check_resource(\n",
" api.create_source(\"s3://bigml-public/arff/diabetes.arff\", {\"name\": \"Diabetes\"}),\n",
" api.get_source)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Create the dataset, but remove the \"weight\" field which is id 000009\n",
"\n",
"diabetes_dataset = api.check_resource(\n",
" api.create_dataset(diabetes_source, {\"excluded_fields\": [ \"000009\" ] }),\n",
" api.get_dataset)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Create the random sample by sampling over the entire dataset but using a range from 1 to 5\n",
"\n",
"diabetes_random_sample = api.check_resource(\n",
" api.create_dataset(diabetes_dataset, {\"sample_rate\": 1, \"range\": [1,5], \"name\": \"Diabetes Random Sample\"}),\n",
" api.get_dataset)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Now we create the model using the random sample\n",
"\n",
"diabetes_random_model = api.check_resource(\n",
" api.create_model(diabetes_random_sample), \n",
" api.get_model)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# And finally, evaluate the performance of the random sample model using the entire dataset\n",
"\n",
"diabetes_random_eval = api.check_resource(\n",
" api.create_evaluation(diabetes_random_model, diabetes_dataset),\n",
" api.get_evaluation)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Here we start the Active Learning demonstration. The first step is to cluster the instances in the diabetes dataset. \n",
"# However, we have to ignore the class (which we want to predict) by scaling it to 0\n",
"\n",
"diabetes_cluster = api.check_resource(\n",
" api.create_cluster(diabetes_dataset, { \"k\": 5, \"field_scales\": { \"class\": 0} }),\n",
" api.get_cluster)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# We need to assign a centroid and distance for each datapoint in the dataset using the cluster. This generates \n",
"# A CSV which we will fetch next. Note that we are keeping all fields in the CSV output, as well as the header and the\n",
"# distance score. This will allow us to sample from this CSV to create a new source/dataset/model\n",
"\n",
"diabetes_batchcentroid = api.check_resource(\n",
" api.create_batch_centroid(diabetes_cluster, diabetes_dataset, { \"all_fields\": True, \"header\": True, \"distance\": True, \"distance_name\": \"distance\" }),\n",
" api.get_batch_centroid)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 8
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Fetch the centroid scores and use the csv module to parse them into a list of dictionaries. This \n",
"# dataset is small, so we are going to just do all the CSV steps in memory rather than write to disk \n",
"# and worry about handling files\n",
"\n",
"centroid_scores = csv.DictReader(api.download_batch_centroid(diabetes_batchcentroid))\n",
"centroid_samples = {}"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 9
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Now we examine each row in the CSV and build up a dictionary mapping the cluster name (Ex: \"Cluster 0\") to a \n",
"# sampled row. The idea here is to find the row (which is an instance) that is closest to the centroid for each cluster. \n",
"# All we need to do is compare the value in the \"distance\" field and keep the smallest.\n",
"\n",
"for row in centroid_scores:\n",
" if not row['cluster'] in centroid_samples:\n",
" centroid_samples[row['cluster']] = row\n",
" else:\n",
" if row[\"distance\"] < centroid_samples[row['cluster']][\"distance\"]:\n",
" centroid_samples[row['cluster']] = row"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 10
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Rather than write to disk, we create an in-memory CSV using StringIO\n",
"# We write a header to the CSV using the field names from the centroid_scores CSV we downloaded previously\n",
"\n",
"CSV = StringIO.StringIO()\n",
"buffer = csv.DictWriter(CSV, centroid_scores.fieldnames)\n",
"buffer.writeheader()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 11
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# For each of the rows with the minimum distance in centroid_samples, we write the rows to the in-memory CSV.\n",
"# These will be our intelligently chosen samples.\n",
"\n",
"for cluster in centroid_samples:\n",
" buffer.writerow(centroid_samples[cluster])"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 12
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Now we repeat the source/dataset/model/eval steps for the cluster samples. \n",
"# First step: create the source from the in-memory CSV\n",
"\n",
"diabetes_cluster_source = api.check_resource(\n",
" api.create_source(CSV, { \"name\": \"Diabetes Cluster Sample\" }),\n",
" api.get_source)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
" "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b"
]
}
],
"prompt_number": 13
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Now create the dataset. We need to drop the cluster assignment and distance fields.\n",
"\n",
"diabetes_cluster_sample = api.check_resource(\n",
" api.create_dataset(diabetes_cluster_source, {\"excluded_fields\": [ \"000009\", \"00000a\" ], \"name\": \"Diabetes Cluster Sample\" }),\n",
" api.get_dataset)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 14
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Model the cluster sample dataset\n",
"\n",
"diabetes_cluster_model = api.check_resource(\n",
" api.create_model(diabetes_cluster_sample), \n",
" api.get_model)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 15
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Last step, evaluate this model against the original dataset\n",
"\n",
"diabetes_cluster_eval = api.check_resource(\n",
" api.create_evaluation(diabetes_cluster_model, diabetes_dataset),\n",
" api.get_evaluation)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 16
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Simple comparison of the accuracy. It's worth comparing these evaluations side-by-side in the bigml.com UI. \n",
"# All 5 metrics should be significantly better. \n",
"\n",
"print \"Random Sample Accuracy: %s\" % diabetes_random_eval['object']['result']['model']['accuracy']\n",
"print \"Cluster Sample Accuracy: %s\" % diabetes_cluster_eval['object']['result']['model']['accuracy']"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Random Sample Accuracy: 0.65495\n",
"Cluster Sample Accuracy: 0.73568\n"
]
}
],
"prompt_number": 17
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment