mmerce/README.md

## README.md

      
    Raw
  

              README.md
            
          
    Python example: multiple local predictions

Using the Python bindings and their local model object, you can generate predictions for test data stored in any local CSV file. In this example, the CSV data is read from stdin and the predictions are written to stdout, but
this can easily be changed to use any local file.
The command options available are:
-h, --help            show the help message and exit
--delimiter DELIMITER
CSV delimiter
--model MODEL_ID      model/id
--prediction-fields PREDICTION_FIELDS
Comma-separated list of input fields (predictors) to
be added to the prediction.
Usage:
cat diabetes.csv | python multiple_pred.py --model model/572b130a3bbd213099002274 --prediction-fields age,insulin  > predictions.csv
Notes:

The diabetes.csv used in the example can be found at s3://bigml-public/csv/diabetes.csv
BigML credentials are expected to be available through the environment variables, but can also be provided in the code as shown in the commented paragraph.

Requirements

The bigml module is needed. To install it you can use pip:
pip install bigml
The code has been tested with Python 2.7 and bigml 4.6.1.

  
## diabetes_sample.csv

          
            pregnancies
            plasma glucose
            blood pressure
            triceps skin thickness
            insulin
            bmi
            diabetes pedigree
            age
            diabetes

            
              6
              148
              72
              35
              0
              33.6
              0.627
              50
              true

            
              1
              85
              66
              29
              0
              26.6
              0.351
              31
              false

            
              8
              183
              64
              0
              0
              23.3
              0.672
              32
              true

            
              1
              89
              66
              23
              94
              28.1
              0.167
              21
              false

            
              0
              137
              40
              35
              168
              43.1
              2.288
              33
              true

## multiple_pred.py
##############################################################################
# Copyright (c) 2015-2016 BigML, Inc
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
##############################################################################

import csv
import sys
import StringIO
import argparse

from bigml.api import BigML
from bigml.model import Model
from bigml.fields import Fields


#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""Application Options: options published so that the user can change them
in the command line

"""

# Text handed to argparse as the program description.
SUMMARY = ("Example of script which predicts locally. It receives data from"
           " stdin and writes predictions to stdout")

# Maps every command-line flag to the keyword arguments that are forwarded
# to ``ArgumentParser.add_argument`` for that flag.
OPTIONS = {
    # model ID to be used
    '--model': {
        # Must be a real boolean: the former string 'True' only worked
        # because any non-empty string is truthy.
        'required': True,
        'action': 'store',
        'dest': 'model_id',
        'help': "model/id"},

    # delimiter in the file
    '--delimiter': {
        'action': 'store',
        'dest': 'delimiter',
        'default': ',',
        'help': "CSV delimiter"},

    # Fields to be added to the prediction
    '--prediction-fields': {
        "action": 'store',
        "dest": 'prediction_fields',
        "help": ("Comma-separated list of input fields"
                 " (predictors) to be added to the prediction.")},
}


def parser_add_options(parser, options):
    """Registers every option in `options` on `parser`.

    parser: object exposing an argparse-style `add_argument` method
    options: dict mapping each flag to the keyword arguments used for it

    Flags are registered in alphabetical order.

    """
    for flag in sorted(options):
        parser.add_argument(flag, **options[flag])


def create_parser(user_options):
    """Builds the command-line parser for the given options.

    user_options: dict mapping each flag to its `add_argument` keyword
                  arguments
    Returns the configured `argparse.ArgumentParser`; nothing is parsed
    here.

    """
    cli_parser = argparse.ArgumentParser(epilog="BigML, Inc",
                                         description=SUMMARY)
    parser_add_options(cli_parser, user_options)
    return cli_parser


def main(args=None):
    """Reads CSV rows from stdin and writes their local predictions to stdout.

    Each output row contains the input fields requested through
    --prediction-fields ("-" when the input row has no such column),
    followed by one confidence and one probability column per class of the
    model's objective field (0 for classes absent from the model's answer).

    args: list of command-line arguments. When None (the default), argparse
          reads sys.argv[1:] at call time rather than using a value that
          was captured when this module was imported.

    """

    # If credentials are properly set in environment variables, there's no need
    # to explicitly create the api object. Otherwise, use next code to set them:
    # api = BigML("username", "api-key")
    # local_model = Model('model/53c83a8f48d9b6322800007d', api=api)

    # Command-line options as a plain dictionary
    context = vars(create_parser(OPTIONS).parse_args(args))

    # Use the user-given local model
    local_model = Model(context['model_id'])

    # Read the CSV as a Dictionary assuming first line has headers. The
    # reader consumes stdin lazily, so the input is never fully buffered.
    reader = csv.DictReader(sys.stdin, delimiter=context['delimiter'])

    # List of fields to be added to the prediction
    prediction_fields = []
    if context['prediction_fields']:
        prediction_fields = context['prediction_fields'].split(",")

    # retrieving the classes available for the prediction
    objective_field = local_model.fields[local_model.objective_id]
    categories = [element[0] for element
                  in objective_field['summary']['categories']]

    # Output header: requested input fields first, then two columns per class
    fieldnames = prediction_fields[:]
    for category in categories:
        fieldnames.extend(['%s confidence' % category,
                           '%s probability' % category])

    # We will write to stdout, but can write to any file-like object
    writer = csv.DictWriter(sys.stdout, fieldnames=fieldnames)
    writer.writeheader()

    # Settings for predictions format
    kwargs = {"add_confidence": True, "multiple": "all"}

    for input_data in reader:
        predictions = local_model.predict(input_data, **kwargs)

        # Copy the requested input fields, flagging missing columns with "-"
        output = {field: input_data.get(field, "-")
                  for field in prediction_fields}

        for prediction in predictions:
            predicted_class = prediction["prediction"]
            output["%s confidence" % predicted_class] = \
                prediction['confidence']
            output["%s probability" % predicted_class] = \
                prediction['probability']

        # Classes the model did not return still need a value in their columns
        for category in categories:
            output.setdefault("%s confidence" % category, 0)
            output.setdefault("%s probability" % category, 0)

        writer.writerow(output)

# Run the script only when executed directly, not when imported as a module.
if __name__ == "__main__":
    main()

## predictions.csv

          
            age
            insulin
            false confidence
            false probability
            true confidence
            true probability

            
              50
              0
              0
              0
              0.20654329147389294
              1.0

            
              31
              0
              0.886482908609522
              1.0
              0
              0

            
              32
              0
              0
              0
              0.7411599827511859
              1.0

            
              21
              94
              0.9650260010779157
              1.0
              0
              0

            
              33
              168
              0
              0
              0.7846829880728186
              1.0
pregnancies	plasma glucose	blood pressure	triceps skin thickness	insulin	bmi	diabetes pedigree	age	diabetes
6	148	72	35	0	33.6	0.627	50	true
1	85	66	29	0	26.6	0.351	31	false
8	183	64	0	0	23.3	0.672	32	true
1	89	66	23	94	28.1	0.167	21	false
0	137	40	35	168	43.1	2.288	33	true
	##############################################################################
	# Copyright (c) 2015-2016 BigML, Inc
	#
	# Permission is hereby granted, free of charge, to any person obtaining
	# a copy of this software and associated documentation files (the
	# "Software"), to deal in the Software without restriction, including
	# without limitation the rights to use, copy, modify, merge, publish,
	# distribute, sublicense, and/or sell copies of the Software, and to
	# permit persons to whom the Software is furnished to do so, subject to
	# the following conditions:
	#
	# The above copyright notice and this permission notice shall be
	# included in all copies or substantial portions of the Software.
	#
	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
	# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
	# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
	# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
	# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
	# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
	# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
	##############################################################################

	import csv
	import sys
	import StringIO
	import argparse

	from bigml.api import BigML
	from bigml.model import Model
	from bigml.fields import Fields



	#!/usr/bin/env python
	# -*- coding: utf-8 -*-

	"""Application Options: options published so that the user can change them
	in the command line

	"""

	# Text handed to argparse as the program description.
	SUMMARY = ("Example of script which predicts locally. It receives data from"
	           " stdin and writes predictions to stdout")

	# Maps every command-line flag to the keyword arguments that are forwarded
	# to ``ArgumentParser.add_argument`` for that flag.
	OPTIONS = {
	    # model ID to be used
	    '--model': {
	        # Must be a real boolean: the former string 'True' only worked
	        # because any non-empty string is truthy.
	        'required': True,
	        'action': 'store',
	        'dest': 'model_id',
	        'help': "model/id"},

	    # delimiter in the file
	    '--delimiter': {
	        'action': 'store',
	        'dest': 'delimiter',
	        'default': ',',
	        'help': "CSV delimiter"},

	    # Fields to be added to the prediction
	    '--prediction-fields': {
	        "action": 'store',
	        "dest": 'prediction_fields',
	        "help": ("Comma-separated list of input fields"
	                 " (predictors) to be added to the prediction.")},
	}


	def parser_add_options(parser, options):
	    """Adds the options to the parser

	    parser: object exposing an argparse-style `add_argument` method
	    options: dict mapping each flag to the keyword arguments used for it

	    Flags are registered in alphabetical order.

	    """
	    for option in sorted(options):
	        parser.add_argument(option, **options[option])


	def create_parser(user_options):
	    """Builds the command-line parser for the given options.

	    user_options: dict mapping each flag to its `add_argument` keyword
	                  arguments
	    Returns the configured `argparse.ArgumentParser`; nothing is parsed
	    here.

	    """
	    parser = argparse.ArgumentParser(
	        description=SUMMARY,
	        epilog="BigML, Inc")

	    parser_add_options(parser, user_options)
	    return parser


	def main(args=None):
	    """Reads CSV rows from stdin and writes their local predictions to stdout.

	    Each output row contains the input fields requested through
	    --prediction-fields ("-" when the input row has no such column),
	    followed by one confidence and one probability column per class of the
	    model's objective field (0 for classes absent from the model's answer).

	    args: list of command-line arguments. When None (the default), argparse
	          reads sys.argv[1:] at call time rather than using a value that
	          was captured when this module was imported.

	    """

	    # If credentials are properly set in environment variables, there's no need
	    # to explicitly create the api object. Otherwise, use next code to set them:
	    # api = BigML("username", "api-key")
	    # local_model = Model('model/53c83a8f48d9b6322800007d', api=api)

	    # Command-line options as a plain dictionary
	    context = vars(create_parser(OPTIONS).parse_args(args))

	    # Use the user-given local model
	    local_model = Model(context['model_id'])

	    # Read the CSV as a Dictionary assuming first line has headers. The
	    # reader consumes stdin lazily, so the input is never fully buffered.
	    reader = csv.DictReader(sys.stdin, delimiter=context['delimiter'])

	    # List of fields to be added to the prediction
	    prediction_fields = []
	    if context['prediction_fields']:
	        prediction_fields = context['prediction_fields'].split(",")

	    # retrieving the classes available for the prediction
	    objective_field = local_model.fields[local_model.objective_id]
	    categories = [element[0] for element
	                  in objective_field['summary']['categories']]

	    # Output header: requested input fields first, then two columns per class
	    fieldnames = prediction_fields[:]
	    for category in categories:
	        fieldnames.extend(['%s confidence' % category,
	                           '%s probability' % category])

	    # We will write to stdout, but can write to any file-like object
	    writer = csv.DictWriter(sys.stdout, fieldnames=fieldnames)
	    writer.writeheader()

	    # Settings for predictions format
	    kwargs = {"add_confidence": True, "multiple": "all"}

	    for input_data in reader:
	        predictions = local_model.predict(input_data, **kwargs)

	        # Copy the requested input fields, flagging missing columns with "-"
	        output = {field: input_data.get(field, "-")
	                  for field in prediction_fields}

	        for prediction in predictions:
	            predicted_class = prediction["prediction"]
	            output["%s confidence" % predicted_class] = \
	                prediction['confidence']
	            output["%s probability" % predicted_class] = \
	                prediction['probability']

	        # Classes the model did not return still need a value in their columns
	        for category in categories:
	            output.setdefault("%s confidence" % category, 0)
	            output.setdefault("%s probability" % category, 0)

	        writer.writerow(output)


	# Run the script only when executed directly, not when imported as a module.
	if __name__ == "__main__":
	    main()
age	insulin	false confidence	false probability	true confidence	true probability
50	0	0	0	0.20654329147389294	1.0
31	0	0.886482908609522	1.0	0	0
32	0	0	0	0.7411599827511859	1.0
21	94	0.9650260010779157	1.0	0	0
33	168	0	0	0.7846829880728186	1.0