ToyBoost algorithm and evaluation script from the 2014-03-11 BigML API Webinar (http://youtu.be/JtudB691AQY)
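In the spirit of AdaBoost, ToyBoost builds a sequence of weighted models: after each round it doubles the weight of the training rows the latest model got wrong and halves the weight of the rows it got right, so each new model concentrates on the hard examples. The evaluation script below scores the growing ensemble against a held-out test set as the models are built.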
#!/usr/bin/env python
#
# Copyright 2014 BigML, Inc
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# Author: Poul E J Petersen / BigML
#
# This script provides a simple way to evaluate the ToyBoost algorithm
# by constructing a local ensemble and scoring the test set.
# Like ToyBoost, it was created for the purpose of demonstrating how
# the BigML API can be extended programmatically, and is only meant as an
# instructional tool.

import csv, sys, time

from bigml.api import BigML
from bigml.ensemble import Ensemble

# Define the path to the test file. Ideally this would be a script parameter.
test_file = "/Users/petersp/Desktop/Demo/ToyBoost/letter-recognition-test.csv"

api = BigML()

ensemble_ids = []
while True:
    # List the model_ids in reverse order (first model built first).
    # This really should use tagging since right now it will include *ALL*
    # models in the user account.
    model_ids = [ el['resource'] for el in reversed(api.list_models()['objects']) ]
    if ensemble_ids == model_ids:
        time.sleep(3)
        continue

    # Build the list of ensembles up one model_id at a time so
    # we can compare performance as each iteration adds a model
    for model_id in model_ids[len(ensemble_ids):]:
        ensemble_ids += [ model_id ]
        print "...waiting on %s" % ensemble_ids[-1]

        # Make sure the last model in the list is ready
        api.ok(api.get_model(ensemble_ids[-1]))

        # Build the ensemble locally
        print "...building ensemble"
        local_ensemble = Ensemble(ensemble_ids)

        # Load the test set as dict rows
        print "...evaluating ensemble"
        input_file = csv.DictReader(open(test_file))

        # Count errors in the ensemble prediction. Note that the name
        # of the label "letter" is hard-coded. This really should be
        # a script parameter, or should default to reading the last value
        errors = 0
        for row in input_file:
            if row['letter'] != local_ensemble.predict(row):
                errors += 1

        # The test set has 4000 rows, so (4000 - errors) / 40 is the
        # percentage of correct predictions
        print "ensemble with %d models -> #errors: %d, %%correct = %f" % (len(ensemble_ids), errors, float(4000-errors)/float(40))
#!/usr/bin/env python
#
# Copyright 2014 BigML, Inc
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# Author: Poul E J Petersen / BigML
#
# This ToyBoost algorithm was created for the purpose of demonstrating how
# the BigML API can be extended programmatically, and is only meant as an
# instructional tool.

# Define the path to the training file. Ideally this would be a script parameter.
#
# The dataset referenced here is from the UCI Machine Learning Repository:
# http://archive.ics.uci.edu/ml/datasets/Letter+Recognition
#
# NOTE: For the webinar, the dataset was modified so that the letter field was
# the last column, and proper field titles were added.
#
# The webinar is available here: https://www.youtube.com/watch?v=lBq-h8k76EQ
source = "/Users/petersp/Desktop/Demo/ToyBoost/letter-recognition-train.csv"

# Load the BigML module
from bigml.api import BigML

# Instantiate an API connection. BIGML_USERNAME and BIGML_API_KEY are read
# from the environment.
api = BigML()
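# For example, set them in the shell before running the script
# (placeholder values):
#   export BIGML_USERNAME=<your-username>
#   export BIGML_API_KEY=<your-api-key>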
# Create a source from the local training file, and wait for it to complete
orig_source = api.create_source(source, { "name": "MrBoost - Original Source" })
api.ok(orig_source)

# Create a dataset
orig_dataset = api.create_dataset(orig_source, { "name": "MrBoost - Original Dataset" })
api.ok(orig_dataset)

# Assume the objective is the last preferred field in the dataset
objective_field_id = orig_dataset['object']['objective_field']['id']

# Add the initial weights, all 1's, using a flatline JSON s-expression
# (an if-expression that always evaluates to 1)
trainset = api.create_dataset(orig_dataset, {
    "new_fields": [
        {
            "field": '[ "if", ["=", 1, 1], 1, 1]',
            "name": "weight"
        },
    ],
    "name": "MrBoost - Train0"
})
api.ok(trainset)
trainset = api.get_dataset(trainset)

# Import Fields and use it to find the field_ids from field names
from bigml.fields import Fields
fields = Fields(trainset['object']['fields'])

# Find the field_id for weight by name
weight_field_id = fields.field_id("weight")

# The prediction field_id will be the next column after the weight_id,
# so we just cast as an int, add 1, and convert back to a padded 6-digit
# hex number (the field_id format)
prediction_field_id = '{:06x}'.format(int(weight_field_id, 16) + 1)
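# For example, if weight_field_id is "00001a", the prediction column added
# by the batch prediction below will get field_id "00001b"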
# Each iteration starts with trainset and:
#   - builds a model using the field "weight" as the weight
#   - scores the model with the trainset
#   - creates a remote source from the batch prediction
#   - creates a resultset from the new source
#   - creates a new trainset by recomputing the weights:
#       weight *= 2 if the prediction was wrong, weight /= 2 if it was right
#
# The final classifier is an ensemble of all trained models.
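# For example, a row with initial weight 1 that is misclassified in the
# first two rounds and then classified correctly carries weights
# 1 -> 2 -> 4 -> 2 across those iterations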
for loop in range(0, 10):
    api.ok(trainset)

    model = api.create_model(trainset, {
        "name": "MrBoost - Model%d" % loop,
        "objective_fields": [ objective_field_id ],
        "excluded_fields": [ weight_field_id ],
        "weight_field": weight_field_id
    })
    api.ok(model)

    batchp = api.create_batch_prediction(model, trainset, {
        "name": "MrBoost - Result%d" % loop,
        "all_fields": True,
        "header": True,
        "confidence": False,
        "prediction_name": "prediction"
    })
    api.ok(batchp)

    # Create the new source by constructing a URL from the batch
    # prediction download path and the authentication from the API module
    source = api.create_source(batchp['location'] + '/download' + api.__dict__['auth'], {
        "name": "MrBoost - Source%d" % int(loop + 1),
    })
    api.ok(source)

    resultset = api.create_dataset(source, {
        "name": "MrBoost - Result%d" % int(loop + 1),
    })
    api.ok(resultset)

    # Create the next training set by recomputing the weights and dropping
    # the old weight and prediction columns
    trainset = api.create_dataset(resultset, {
        "new_fields": [
            {
                "field": '["if", ["=", ["f", "%s"], ["f", "prediction"]], ["/", [ "real", ["f", "weight"] ], 2], ["*", [ "real", ["f", "weight"]], 2]]' % ( objective_field_id ),
                "name": "weight"
            },
        ],
        "all_but": [ prediction_field_id, weight_field_id ],
        "name": "MrBoost - Train%d" % int(loop + 1)
    })
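To make the reweighting logic concrete without the API plumbing, here is a minimal local sketch of the same loop. It is illustrative only: MajorityStump is a deliberately trivial weak learner standing in for the decision trees BigML builds remotely, and the plurality vote is only an approximation of the local Ensemble's combiner.

# toyboost_sketch.py -- illustrative companion to the scripts above, not
# part of the webinar code

class MajorityStump(object):
    """Toy weak learner: always predicts the label with the largest
    total weight in the training data."""
    def __init__(self, labels, weights):
        totals = {}
        for label, w in zip(labels, weights):
            totals[label] = totals.get(label, 0.0) + w
        self.label = max(totals, key=totals.get)

    def predict(self, row):
        return self.label

def toyboost(rows, labels, rounds=10):
    # Start with uniform weights, mirroring the all-1's flatline column
    weights = [1.0] * len(rows)
    models = []
    for _ in range(rounds):
        model = MajorityStump(labels, weights)
        models.append(model)
        for i, row in enumerate(rows):
            if model.predict(row) == labels[i]:
                weights[i] /= 2   # de-emphasize rows we already get right
            else:
                weights[i] *= 2   # focus the next model on the mistakes
    return models

def ensemble_predict(models, row):
    # Plurality vote over all models
    votes = {}
    for model in models:
        label = model.predict(row)
        votes[label] = votes.get(label, 0) + 1
    return max(votes, key=votes.get)

if __name__ == "__main__":
    rows = [{"x": i} for i in range(6)]
    labels = ["A", "A", "A", "B", "B", "A"]
    models = toyboost(rows, labels, rounds=3)
    print ensemble_predict(models, {"x": 0})   # -> "A"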