whizzmler

## example.py
#!/usr/bin/env python
"""A simple WhizzML example

"""

from bigml.api import BigML

# WhizzML source for the library: it defines one function, `addition`,
# which takes two arguments and returns their sum.
_ADDITION_SOURCE = "(define (addition a b) (+ a b))"

# Client object used for the request below. It is built with no explicit
# arguments here (presumably credentials come from the environment -- TODO
# confirm against the bindings' documentation).
API = BigML()

# Create the library resource from the WhizzML source above.
LIBRARY = API.create_library(_ADDITION_SOURCE)

## find-neighbors.json
{
  "name": "Find neighbors",
  "description": "Find the closest cluster rows to a given one",
  "inputs": [
    {"name": "cluster-id", "type": "cluster-id", "description": "The cluster to select rows from"},
    {"name": "n", "type": "number", "description": "The number of points to return"},
    {"name": "instance", "type": "map", "description": "Base row to compute distances, as a map from field identifiers to values"}
  ],
  "outputs": [
    {"name": "rows", "type": "list", "description": "The list of the n closest rows to `instance`"}

## readme.md

      
              2 files
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                whizzmler
                / readme.md
            
            
              Last active
              May 10, 2016 21:22
            
          
    Stacked generalization

Objective: Improve predictions by modeling the output scores of multiple trained models.

1. Create a training and a holdout set
2. Create n different models on the training set (with some
   difference among them; e.g., single-tree vs. ensemble vs. logistic
   regression)
3. Make predictions from those models on the holdout set
4. Train a model to predict the class based on the other models' predictions


## predict-by-clusters.json
{"description": "A library with functions to make predictions from the results of a Model by Clusters execution",
 "name": "Predict by clusters"}

## model-by-clusters.json
{"description": "A script that generates a cluster and a set of models from its centroid datasets",
 "name": "Model by Clusters",
 "inputs": [{"name": "source-id", "type": "source-id"}],
 "outputs": [{"name": "dataset-id", "type": "dataset-id", "description": "Full dataset from input source"},
             {"name": "cluster-id", "type": "cluster-id", "description": "G-means cluster from full dataset"},
             {"name": "models", "type": "map", "description": "Map from centroid id to associated predictive model"},
             {"name": "names", "type": "map", "description": "Map from centroid id to centroid name"},
             {"name": "evaluations", "type": "map", "description": "Evaluations for each of the per centroid models"}]}

## normalize-dataset.json
{
  "name": "Normalize Dataset",
  "description": "Remove the top n anomalies from a dataset",
  "inputs": [
    {"name": "dataset-id", "type": "dataset-id", "description": "Dataset Id"},
    {"name": "top-n", "type": "number", "description": "Top N Anomalies to Remove"}
  ],
  "outputs": [
    {"name": "normalized-dataset", "type": "dataset-id", "description": "Normalized Dataset"}
  ],

## gradient-boosting.json
{
	"name": "Gradient Boosting",
	"description": "Perform gradient boosting for classification",
	"inputs": [{
		"name": "dataset-id",
		"type": "dataset-id",
		"description": "Select the training dataset"
	}],
	"outputs": [{
		"name": "model-array",

## best-first.whizzml
;; Returns the largest element of the list `xs`.
;; A pairwise "keep the greater of the two" step is folded over the whole
;; list, seeded with the list's first element.
(define (get-max xs)
  (let (greater (lambda (best candidate)
                  (if (> best candidate) best candidate)))
    (reduce greater (head xs) xs)))

;; Maps a list of field ids to the corresponding field names.
;; The dataset identified by `dataset-id` is fetched once; each name is read
;; from the "name" entry of that id's entry in the dataset's "fields" map.
(define (feature-names dataset-id ids)
  (let (dataset (fetch dataset-id))
    (let (fields (get dataset "fields"))
      (map (lambda (field-id) (get-in fields [field-id "name"])) ids))))

;; Create a dataset sample
(define (sample-dataset ds-id rate oob)

## one-click-dataset.json
{
  "name": "One-Click Dataset",
  "description": "Create a new dataset from a source with a click",
  "outputs": [
    {"name": "dataset-id", "type": "dataset-id", "description": "The new dataset"},
    {"name": "rows", "type": "number", "description": "The number of rows of the new dataset"}
  ],
  "inputs": [
    {"name": "source-id", "type": "source-id", "default": "", "description": "Source from which to create a new dataset"},
    {"name": "source-name", "type": "string", "default": "", "description": "Name for the new dataset"}

## model-or-ensemble.json
{
    "name": "Model or ensemble",
    "description": "Select the best option for modeling a source: a model or an ensemble?",
    "inputs": [
        {
            "name": "input-source-id",
            "type": "source-id",
            "description": "Source for training and testing the model and the ensemble"
        }
    ],
	#!/usr/bin/env python
	"""A simple WhizzML example

	"""

	from bigml.api import BigML

	# WhizzML source for the library: it defines one function, `addition`,
	# which takes two arguments and returns their sum.
	_ADDITION_SOURCE = "(define (addition a b) (+ a b))"

	# Client object used for the request below. It is built with no explicit
	# arguments here (presumably credentials come from the environment -- TODO
	# confirm against the bindings' documentation).
	API = BigML()

	# Create the library resource from the WhizzML source above.
	LIBRARY = API.create_library(_ADDITION_SOURCE)
	{
	"name": "Find neighbors",
	"description": "Find the closest cluster rows to a given one",
	"inputs": [
	{"name": "cluster-id", "type": "cluster-id", "description": "The cluster to select rows from"},
	{"name": "n", "type": "number", "description": "The number of points to return"},
	{"name": "instance", "type": "map", "description": "Base row to compute distances, as a map from field identifiers to values"}
	],
	"outputs": [
	{"name": "rows", "type": "list", "description": "The list of the n closest rows to `instance`"}
	{"description": "A library with functions to make predictions from the results of a Model by Clusters execution",
	"name": "Predict by clusters"}
	{"description": "A script that generates a cluster and a set of models from its centroid datasets",
	"name": "Model by Clusters",
	"inputs": [{"name": "source-id", "type": "source-id"}],
	"outputs": [{"name": "dataset-id", "type": "dataset-id", "description": "Full dataset from input source"},
	{"name": "cluster-id", "type": "cluster-id", "description": "G-means cluster from full dataset"},
	{"name": "models", "type": "map", "description": "Map from centroid id to associated predictive model"},
	{"name": "names", "type": "map", "description": "Map from centroid id to centroid name"},
	{"name": "evaluations", "type": "map", "description": "Evaluations for each of the per centroid models"}]}
	{
	"name": "Normalize Dataset",
	"description": "Remove the top n anomalies from a dataset",
	"inputs": [
	{"name": "dataset-id", "type": "dataset-id", "description": "Dataset Id"},
	{"name": "top-n", "type": "number", "description": "Top N Anomalies to Remove"}
	],
	"outputs": [
	{"name": "normalized-dataset", "type": "dataset-id", "description": "Normalized Dataset"}
	],
	{
	"name": "Gradient Boosting",
	"description": "Perform gradient boosting for classification",
	"inputs": [{
	"name": "dataset-id",
	"type": "dataset-id",
	"description": "Select the training dataset"
	}],
	"outputs": [{
	"name": "model-array",
	;; Returns the largest element of the list `xs`.
	;; A pairwise "keep the greater of the two" step is folded over the whole
	;; list, seeded with the list's first element.
	(define (get-max xs)
	  (let (greater (lambda (best candidate)
	                  (if (> best candidate) best candidate)))
	    (reduce greater (head xs) xs)))

	;; Maps a list of field ids to the corresponding field names.
	;; The dataset identified by `dataset-id` is fetched once; each name is read
	;; from the "name" entry of that id's entry in the dataset's "fields" map.
	(define (feature-names dataset-id ids)
	  (let (dataset (fetch dataset-id))
	    (let (fields (get dataset "fields"))
	      (map (lambda (field-id) (get-in fields [field-id "name"])) ids))))

	;; Create a dataset sample
	(define (sample-dataset ds-id rate oob)
	{
	"name": "One-Click Dataset",
	"description": "Create a new dataset from a source with a click",
	"outputs": [
	{"name": "dataset-id", "type": "dataset-id", "description": "The new dataset"},
	{"name": "rows", "type": "number", "description": "The number of rows of the new dataset"}
	],
	"inputs": [
	{"name": "source-id", "type": "source-id", "default": "", "description": "Source from which to create a new dataset"},
	{"name": "source-name", "type": "string", "default": "", "description": "Name for the new dataset"}
	{
	"name": "Model or ensemble",
	"description": "Select the best option for modeling a source: a model or an ensemble?",
	"inputs": [
	{
	"name": "input-source-id",
	"type": "source-id",
	"description": "Source for training and testing the model and the ensemble"
	}
	],