@petersen-poul
Created March 12, 2014
ToyBoost algorithm and evaluation script from the 2014-03-11 BigML API Webinar (http://youtu.be/JtudB691AQY)
#!/usr/bin/env python
#
# Copyright 2014 BigML, Inc
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
# Author: Poul E J Petersen / BigML
#
# This script provides a simple way to evaluate the ToyBoost algorithm
# by constructing a local ensemble and scoring the test set.
# Like ToyBoost, it was created for the purpose of demonstrating how
# the BigML API can be extended programmatically, and is only meant as an
# instructional tool.
import csv, sys, time
from bigml.api import BigML
from bigml.ensemble import Ensemble
# Define the path to the test file. Ideally this would be a script parameter.
test_file = "/Users/petersp/Desktop/Demo/ToyBoost/letter-recognition-test.csv"
api = BigML()
ensemble_ids = []
while True:
    # List the model_ids in reverse order (first model built first).
    # This really should use tagging since right now it will include *ALL*
    # models in the user account.
    model_ids = [ el['resource'] for el in reversed(api.list_models()['objects']) ]
    if ensemble_ids == model_ids:
        time.sleep(3)
        continue
    # Build the list of ensembles up one model_id at a time so
    # we can compare performance as each iteration adds a model
    for model_id in model_ids[len(ensemble_ids):]:
        ensemble_ids += [ model_id ]
        print "...waiting on %s" % ensemble_ids[-1]
        # Make sure the last model in the list is ready
        api.ok(api.get_model(ensemble_ids[-1]))
        # Build the ensemble locally
        print "...building ensemble"
        local_ensemble = Ensemble(ensemble_ids)
        # Load the test file as dict rows
        print "...evaluating ensemble"
        input_file = csv.DictReader(open(test_file))
        # Count errors in the ensemble prediction. Note that the name
        # of the label "letter" is hard-coded. This really should be
        # a script parameter, or should default to the last column.
        errors = 0
        for row in input_file:
            if row['letter'] != local_ensemble.predict(row):
                errors += 1
        # The test set has 4000 rows, so %correct = (4000 - errors) / 40
        print "ensemble with %d models -> #errors: %d, %%correct = %f" % (len(ensemble_ids), errors, float(4000-errors)/float(40))
#!/usr/bin/env python
#
# Copyright 2014 BigML, Inc
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
# Author: Poul E J Petersen / BigML
#
# This ToyBoost algorithm was created for the purpose of demonstrating how
# the BigML API can be extended programmatically, and is only meant as an
# instructional tool.
# Define the path to the training file. Ideally this would be a script parameter.
#
# The dataset referenced here is from UCI Repository:
# http://archive.ics.uci.edu/ml/datasets/Letter+Recognition
#
# NOTE: For the webinar, the dataset was modified so that the letter field was
# the last column, and proper field titles were added.
#
# The webinar is available here: https://www.youtube.com/watch?v=lBq-h8k76EQ
source="/Users/petersp/Desktop/Demo/ToyBoost/letter-recognition-train.csv"
# Load the BigML module
from bigml.api import BigML
# Instantiate an API connection. BIGML_USERNAME and BIGML_API_KEY are read from the
# environment
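# (for example, by running export BIGML_USERNAME=<your username> and
# export BIGML_API_KEY=<your API key> in the shell before this script)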
api = BigML()
# Create a source from the local training file, and wait for it to complete
orig_source = api.create_source(source, { "name": "MrBoost - Original Source" })
api.ok(orig_source)
# Create a dataset
orig_dataset = api.create_dataset(orig_source, { "name": "MrBoost - Original Dataset" })
api.ok(orig_dataset)
# Assume the objective is the last preferred field in the dataset
objective_field_id = orig_dataset['object']['objective_field']['id']
# Add the initial weights, all 1's, using a flatline json s-exp
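# (the s-expression reads "if 1 = 1 then 1 else 1", so it evaluates to a
# constant 1 for every row, giving each training instance an equal
# starting weight)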
trainset = api.create_dataset(orig_dataset, {
    "new_fields": [
        {
            "field": '[ "if", ["=", 1, 1], 1, 1]',
            "name": "weight"
        },
    ],
    "name": "MrBoost - Train0"
})
api.ok(trainset)
trainset = api.get_dataset(trainset)
# import Fields and use it to find the field_ids from field names
from bigml.fields import Fields
fields = Fields(trainset['object']['fields'])
# Find the field_id for weight by name
weight_field_id = fields.field_id("weight")
# The prediction field_id will be the next column after the weight_id
# so we just cast as an int, add 1, and convert back to a padded 6-digit
# hex number (the field_id format)
prediction_field_id = '{:06x}'.format(int(weight_field_id,16)+1)
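# For example, if the weight field received id "000012" (18 in hex), the
# prediction column appended by the batch prediction would get
# '{:06x}'.format(18 + 1), i.e. "000013". The concrete ids here are
# illustrative; they depend on how many fields the dataset has.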
# Each iteration starts with trainset and:
# builds a model using field weight as the weight
# scores the model with the trainset
# creates a remote source from the batch prediction
# creates a resultset from the new source
# creates a new trainset by recomputing the weights
# weight = *2 if prediction wrong, /2 if prediction right
#
# Final classifier is an ensemble of all trained models
for loop in range(0,10):
    api.ok(trainset)
    model = api.create_model(trainset, {
        "name": "MrBoost - Model%d" % loop,
        "objective_fields": [ objective_field_id ],
        "excluded_fields": [ weight_field_id ],
        "weight_field": weight_field_id
    })
    api.ok(model)
    batchp = api.create_batch_prediction(model, trainset, {
        "name": "MrBoost - Result%d" % loop,
        "all_fields": True,
        "header": True,
        "confidence": False,
        "prediction_name": "prediction"
    })
    api.ok(batchp)
    # Create the new source by constructing a URL from the batch
    # prediction download path and the authentication from the API module
    source = api.create_source(batchp['location']+'/download'+api.__dict__['auth'], {
        "name": "MrBoost - Source%d" % int(loop+1),
    })
    api.ok(source)
    resultset = api.create_dataset(source, {
        "name": "MrBoost - Result%d" % int(loop+1),
    })
    api.ok(resultset)
    # Create the next training set by recomputing the weights and dropping the
    # old weights and the prediction column
    trainset = api.create_dataset(resultset, {
        "new_fields": [
            {
                "field": '["if", ["=", ["f", "%s"], ["f", "prediction"]], ["/", [ "real", ["f", "weight"] ], 2], ["*", [ "real", ["f", "weight"]], 2]]' % ( objective_field_id ),
                "name": "weight"
            },
        ],
        "all_but": [ prediction_field_id, weight_field_id ],
        "name": "MrBoost - Train%d" % int(loop+1)
    })
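For reference, the flatline s-expression above is the entire boosting step: it compares each row's true label to the model's prediction and rescales the row's weight accordingly. The same per-row rule written as plain Python (illustrative only; in the script the update runs server-side through flatline):

# Plain-Python sketch of the per-row weight update encoded in the
# flatline s-expression: halve the weight when the model predicted the
# label correctly, double it when it did not.
def update_weight(weight, label, prediction):
    if label == prediction:
        return weight / 2.0
    return weight * 2.0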