@petersen-poul
Created March 12, 2014
ToyBoost algorithm and evaluation script from the 2014-03-11 BigML API Webinar (http://youtu.be/JtudB691AQY)
#!/usr/bin/env python
#
# Copyright 2014 BigML, Inc
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
# Author: Poul E J Petersen / BigML
#
# This script provides a simple way to evaluate the ToyBoost algorithm
# by constructing a local ensemble and scoring the test set.
# Like ToyBoost, it was created for the purpose of demonstrating how
# the BigML API can be extended programmatically, and is only meant as an
# instructional tool.
import csv, sys, time
from bigml.api import BigML
from bigml.ensemble import Ensemble
# Define the path to the test file. Ideally this would be a script parameter.
test_file = "/Users/petersp/Desktop/Demo/ToyBoost/letter-recognition-test.csv"
api = BigML()
ensemble_ids = []
while True:
    # List the model_ids in reverse order (first model built first).
    # This really should use tagging since right now it will include *ALL*
    # models in the user account.
    model_ids = [ el['resource'] for el in reversed(api.list_models()['objects']) ]
    if ensemble_ids == model_ids:
        time.sleep(3)
        continue
    # Build the list of ensembles up one model_id at a time so
    # we can compare performance as each iteration adds a model
    for model_id in model_ids[len(ensemble_ids):]:
        ensemble_ids += [ model_id ]
        print "...waiting on %s" % ensemble_ids[-1]
        # Make sure the last model in the list is ready
        api.ok(api.get_model(ensemble_ids[-1]))
        # Build the ensemble locally
        print "...building ensemble"
        local_ensemble = Ensemble(ensemble_ids)
        # Load the test file as dict rows
        print "...evaluating ensemble"
        input_file = csv.DictReader(open(test_file))
        # Count errors in the ensemble prediction. Note that the name
        # of the label "letter" is hard-coded. This really should be
        # a script parameter, or should default to the last column.
        errors = 0
        for row in input_file:
            if row['letter'] != local_ensemble.predict(row):
                errors += 1
        # The test set has 4000 rows, so %correct = (4000 - errors) / 40
        print "ensemble with %d models -> #errors: %d, %%correct = %f" % (len(ensemble_ids), errors, float(4000-errors)/float(40))
#!/usr/bin/env python
#
# Copyright 2014 BigML, Inc
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
# Author: Poul E J Petersen / BigML
#
# This ToyBoost algorithm was created for the purpose of demonstrating how
# the BigML API can be extended programmatically, and is only meant as an
# instructional tool.
# Define the path to the training file. Ideally this would be a script parameter.
#
# The dataset referenced here is from UCI Repository:
# http://archive.ics.uci.edu/ml/datasets/Letter+Recognition
#
# NOTE: For the webinar, the dataset was modified so that the letter field was
# the last column, and proper field titles were added.
#
# The webinar is available here: https://www.youtube.com/watch?v=lBq-h8k76EQ
source="/Users/petersp/Desktop/Demo/ToyBoost/letter-recognition-train.csv"
# Load the BigML module
from bigml.api import BigML
# Instantiate an API connection. BIGML_USERNAME and BIGML_API_KEY are read from the
# environment
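# (for example, by running export BIGML_USERNAME=<your username> and
# export BIGML_API_KEY=<your API key> in the shell before this script)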
api = BigML()
# Create a source from the local training file, and wait for it to complete
orig_source = api.create_source(source, { "name": "MrBoost - Original Source" })
api.ok(orig_source)
# Create a dataset
orig_dataset = api.create_dataset(orig_source, { "name": "MrBoost - Original Dataset" })
api.ok(orig_dataset)
# Assume the objective is the last preferred field in the dataset
objective_field_id = orig_dataset['object']['objective_field']['id']
# Add the initial weights, all 1's, using a flatline json s-exp
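# (the s-expression reads "if 1 = 1 then 1 else 1", so it evaluates to a
# constant 1 for every row, giving each training instance an equal
# starting weight)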
trainset = api.create_dataset(orig_dataset, {
    "new_fields": [
        {
            "field": '[ "if", ["=", 1, 1], 1, 1]',
            "name": "weight"
        },
    ],
    "name": "MrBoost - Train0"
})
api.ok(trainset)
trainset = api.get_dataset(trainset)
# import Fields and use it to find the field_ids from field names
from bigml.fields import Fields
fields = Fields(trainset['object']['fields'])
# Find the field_id for weight by name
weight_field_id = fields.field_id("weight")
# The prediction field_id will be the next column after the weight_id
# so we just cast as an int, add 1, and convert back to a padded 6-digit
# hex number (the field_id format)
prediction_field_id = '{:06x}'.format(int(weight_field_id,16)+1)
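# For example, if the weight field received id "000012" (18 in hex), the
# prediction column appended by the batch prediction would get
# '{:06x}'.format(18 + 1), i.e. "000013". The concrete ids here are
# illustrative; they depend on how many fields the dataset has.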
# Each iteration starts with trainset and:
# builds a model using field weight as the weight
# scores the model with the trainset
# creates a remote source from the batch prediction
# creates a resultset from the new source
# creates a new trainset by recomputing the weights
# weight = *2 if prediction wrong, /2 if prediction right
#
# Final classifier is an ensemble of all trained models
for loop in range(0,10):
    api.ok(trainset)
    model = api.create_model(trainset, {
        "name": "MrBoost - Model%d" % loop,
        "objective_fields": [ objective_field_id ],
        "excluded_fields": [ weight_field_id ],
        "weight_field": weight_field_id
    })
    api.ok(model)
    batchp = api.create_batch_prediction(model, trainset, {
        "name": "MrBoost - Result%d" % loop,
        "all_fields": True,
        "header": True,
        "confidence": False,
        "prediction_name": "prediction"
    })
    api.ok(batchp)
    # Create the new source by constructing a URL from the batch
    # prediction download path and the authentication from the API module
    source = api.create_source(batchp['location']+'/download'+api.__dict__['auth'], {
        "name": "MrBoost - Source%d" % int(loop+1),
    })
    api.ok(source)
    resultset = api.create_dataset(source, {
        "name": "MrBoost - Result%d" % int(loop+1),
    })
    api.ok(resultset)
    # Create the next training set by recomputing the weights and dropping the
    # old weights and the prediction column
    trainset = api.create_dataset(resultset, {
        "new_fields": [
            {
                "field": '["if", ["=", ["f", "%s"], ["f", "prediction"]], ["/", [ "real", ["f", "weight"] ], 2], ["*", [ "real", ["f", "weight"]], 2]]' % ( objective_field_id ),
                "name": "weight"
            },
        ],
        "all_but": [ prediction_field_id, weight_field_id ],
        "name": "MrBoost - Train%d" % int(loop+1)
    })
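For reference, the flatline s-expression above is the entire boosting step: it compares each row's true label to the model's prediction and rescales the row's weight accordingly. The same per-row rule written as plain Python (illustrative only; in the script the update runs server-side through flatline):

# Plain-Python sketch of the per-row weight update encoded in the
# flatline s-expression: halve the weight when the model predicted the
# label correctly, double it when it did not.
def update_weight(weight, label, prediction):
    if label == prediction:
        return weight / 2.0
    return weight * 2.0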