ToyBoost algorithm and evaluation script from the 2014-03-11 BigML API Webinar (http://youtu.be/JtudB691AQY)
#!/usr/bin/env python
#
# Copyright 2014 BigML, Inc
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
# Author: Poul E J Petersen / BigML
#
# This script provides a simple way to evaluate the ToyBoost algorithm
# by constructing a local ensemble and scoring the test set.
# Like ToyBoost itself, it was created to demonstrate how the BigML API
# can be extended programmatically, and is only meant as an
# instructional tool.
import csv, sys, time

from bigml.api import BigML
from bigml.ensemble import Ensemble
# Define the path to the test file. Ideally this would be a script parameter.
test_file = "/Users/petersp/Desktop/Demo/ToyBoost/letter-recognition-test.csv"
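# A minimal alternative sketch, taking the path from the command line
# (the sys.argv convention here is an assumption, not part of the webinar code):
#     if len(sys.argv) > 1:
#         test_file = sys.argv[1]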
api = BigML()
ensemble_ids = []
while True:
    # List the model_ids in reverse order (first model built first).
    # This really should use tagging, since right now it will include *ALL*
    # models in the user account.
    model_ids = [el['resource'] for el in reversed(api.list_models()['objects'])]
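    # A hedged sketch of the tagging approach (assumes the models were
    # created with a tag such as "toyboost", and that the bindings accept
    # a filter query string):
    #     api.list_models("tags__in=toyboost")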
    if ensemble_ids == model_ids:
        time.sleep(3)
        continue
    # Build the list of ensembles up one model_id at a time so
    # we can compare performance as each iteration adds a model
    for model_id in model_ids[len(ensemble_ids):]:
        ensemble_ids += [model_id]
        print("...waiting on %s" % ensemble_ids[-1])
        # Make sure the last model in the list is ready
        api.ok(api.get_model(ensemble_ids[-1]))
        # Build the ensemble locally
        print("...building ensemble")
        local_ensemble = Ensemble(ensemble_ids)
        # Load the test set as dict rows
        print("...evaluating ensemble")
        input_file = csv.DictReader(open(test_file))
        # Count errors in the ensemble prediction. Note that the name
        # of the label "letter" is hard-coded. This really should be
        # a script parameter, or should default to the last column.
        errors = 0
        total = 0
        for row in input_file:
            total += 1
            if row['letter'] != local_ensemble.predict(row):
                errors += 1
        print("ensemble with %d models -> #errors: %d, %%correct = %f" %
              (len(ensemble_ids), errors, 100.0 * (total - errors) / total))
#!/usr/bin/env python
#
# Copyright 2014 BigML, Inc
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
# Author: Poul E J Petersen / BigML
#
# This ToyBoost algorithm was created to demonstrate how the BigML API
# can be extended programmatically, and is only meant as an
# instructional tool.
#
# The dataset referenced here is from the UCI Repository:
# http://archive.ics.uci.edu/ml/datasets/Letter+Recognition
#
# NOTE: For the webinar, the dataset was modified so that the letter field was
# the last column, and proper field titles were added.
#
# The webinar is available here: https://www.youtube.com/watch?v=lBq-h8k76EQ

# Define the path to the training file. Ideally this would be a script parameter.
source="/Users/petersp/Desktop/Demo/ToyBoost/letter-recognition-train.csv" | |

# Load the BigML module
from bigml.api import BigML

# Instantiate an API connection. BIGML_USERNAME and BIGML_API_KEY are read
# from the environment.
api = BigML()
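# Credentials can also be passed explicitly to the constructor; a sketch with
# placeholder (not real) values:
#     api = BigML("my_username", "my_api_key")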
# Create a source from the local training file, and wait for it to complete
orig_source = api.create_source(source, {"name": "MrBoost - Original Source"})
api.ok(orig_source)

# Create a dataset
orig_dataset = api.create_dataset(orig_source, {"name": "MrBoost - Original Dataset"})
api.ok(orig_dataset)
# Assume the objective is the last preferred field in the dataset
objective_field_id = orig_dataset['object']['objective_field']['id']
# Add the initial weights, all 1's, using a Flatline JSON s-expression
# (the expression below always evaluates to 1)
trainset = api.create_dataset(orig_dataset, {
    "new_fields": [
        {
            "field": '[ "if", ["=", 1, 1], 1, 1]',
            "name": "weight"
        },
    ],
    "name": "MrBoost - Train0"
})
api.ok(trainset)
trainset = api.get_dataset(trainset)

# Import Fields and use it to find the field_ids from field names
from bigml.fields import Fields
fields = Fields(trainset['object']['fields'])

# Find the field_id for weight by name
weight_field_id = fields.field_id("weight")
# The prediction field_id will be the next column after the weight_id,
# so we just cast as an int, add 1, and convert back to a padded 6-digit
# hex number (the field_id format)
prediction_field_id = '{:06x}'.format(int(weight_field_id, 16) + 1)
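# For example, if the weight column's id were "000011", the prediction column
# added by the batch prediction below would get id "000012":
#     '{:06x}'.format(int("000011", 16) + 1)  # -> '000012'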
# Each iteration starts with trainset and:
#   - builds a model using the field "weight" as the weight
#   - scores the model with the trainset
#   - creates a remote source from the batch prediction
#   - creates a resultset from the new source
#   - creates a new trainset by recomputing the weights:
#       weight *= 2 if prediction wrong, /= 2 if prediction right
#
# The final classifier is an ensemble of all trained models
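# Worked example of the update rule: a row misclassified in two consecutive
# rounds has its weight go 1 -> 2 -> 4, while a row classified correctly both
# times goes 1 -> 0.5 -> 0.25, so later models concentrate on the hard rows.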
for loop in range(10):
    api.ok(trainset)
    model = api.create_model(trainset, {
        "name": "MrBoost - Model%d" % loop,
        "objective_fields": [objective_field_id],
        "excluded_fields": [weight_field_id],
        "weight_field": weight_field_id
    })
    api.ok(model)
    batchp = api.create_batch_prediction(model, trainset, {
        "name": "MrBoost - Result%d" % loop,
        "all_fields": True,
        "header": True,
        "confidence": False,
        "prediction_name": "prediction"
    })
    api.ok(batchp)
    # Create the new source by constructing a URL from the batch
    # prediction download path and the authentication from the API module
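    # (A hedged alternative: later versions of the bindings expose
    # api.download_batch_prediction(batchp, filename=...), which would avoid
    # building the URL by hand; the manual URL is kept here as in the webinar.)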
    source = api.create_source(batchp['location'] + '/download' + api.auth, {
        "name": "MrBoost - Source%d" % (loop + 1),
    })
    api.ok(source)
    resultset = api.create_dataset(source, {
        "name": "MrBoost - Result%d" % (loop + 1),
    })
    api.ok(resultset)
    # Create the next training set by recomputing the weights and dropping
    # the old weight and prediction columns
    trainset = api.create_dataset(resultset, {
        "new_fields": [
            {
                "field": '["if", ["=", ["f", "%s"], ["f", "prediction"]], ["/", [ "real", ["f", "weight"] ], 2], ["*", [ "real", ["f", "weight"]], 2]]' % objective_field_id,
                "name": "weight"
            },
        ],
        "all_but": [prediction_field_id, weight_field_id],
        "name": "MrBoost - Train%d" % (loop + 1)
    })
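
# The "ensemble of all trained models" is assembled client-side by the
# companion evaluation script above, which collects the model ids as they
# appear and wraps them in a local Ensemble:
#     from bigml.ensemble import Ensemble
#     local_ensemble = Ensemble(model_ids)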