Script to weight instances using cross-validation batch predictions
{
  "name": "Comparing model to batch-prediction-weighted model",
  "description": "Comparing evaluation of the usual model with a model built using a weight field. The field is defined by using k-fold batch predictions to take into account which instances are predicted correctly and the confidence of this prediction. It also balances the imbalanced classes.",
  "inputs": [
    {
      "name": "dataset-id",
      "type": "dataset-id",
      "description": "Select the dataset"
    },
    {
      "name": "k-folds",
      "type": "number",
      "description": "Number of folds for the cross-validation batch predictions",
      "default": 5
    },
    {
      "name": "objective-name",
      "type": "string",
      "description": "Name of the field to be predicted",
      "default": ""
    },
    {
      "name": "model-options",
      "type": "map",
      "description": "Settings used to create the model",
      "default": {}
    },
    {
      "name": "batch-prediction-options",
      "type": "map",
      "description": "Settings used to create the batch predictions",
      "default": {}
    }
  ],
  "outputs": [
    {
      "name": "output-eval",
      "type": "map",
      "description": "Evaluation measures for the reference model (no batch predictions used) and the weighted one."
    }
  ]
}
;;Balancing with cost function example
;;
;;Builds a model for imbalanced datasets by assigning a weight to each
;;instance that balances the classes and selectively gives more importance
;;to the instances that improve the model's performance.
;;
;;The evaluation that decides whether an instance is weighted more or less
;;is done in a k-fold cross-validation fashion, but instead of aggregating
;;the results, each prediction is compared to the actual value of the
;;instance and an associated weight is assigned depending on the result of
;;this comparison, much like a cost function.
;;
;; Inputs:
;; dataset-id: (string) Dataset ID that contains the imbalanced data
;; k-folds: (integer) Number of parts the dataset is divided into for the
;; cross-validation batch predictions
;; objective-name: (string) Name of the field to be predicted
;; model-options: (map) Attributes that will be used in the model creation
;; calls
;; batch-prediction-options: (map) Attributes that will be used in the
;; batch prediction creation calls
;;
;; k-fold cross-validation code
;; This code will eventually be defined as a library.
(define MODEL_OPTIONS ["balance_objective"
"missing_splits"
"pruning"
"weight_field"
"objective_weights"
"node_threshold"
"seed"])
(define ENSEMBLE_OPTIONS (concat MODEL_OPTIONS
["sample_rate"
"replacement"
"randomize"
"number_of_models"]))
(define BATCH_PREDICTION_OPTIONS ["sample_rate"
"out_of_bag"
"range"
"replacement"
"ordering"
"seed"
"missing_strategy"
"combiner"])
;; batch datasets using k-folds
;;
;; creates k-fold batch datasets for a dataset
;; Inputs:
;; dataset-id: (string) Dataset ID
;; k-folds: (integer) Number of folds
;; model-options: (map) Options to use in model/ensemble
;; batch-prediction-options: (map) Options to use in batch predictions
;; creation
;;
;; Output: (list) batch predicted datasets
;;
;; Raises:
;; 101: The dataset-id argument is not a string
;; 102: The dataset-id is not a valid dataset ID
;; 103: The k-folds argument is not an integer
;; 104: The k-folds argument is not >= 2
;; 105: The k-folds argument is higher than the maximum
;; 106: The objective field ID is not in the selectable IDs list
;; 107: The k-folds argument is too high compared to the number of rows
;;
;; check-resource-id
;;
;; Validates that the argument is a resource ID of the expected type. Raises
;; an error otherwise.
;;
;; Inputs:
;; resource-id: (string) Resource ID
;; type: (string) Type of resource
;;
;; Output: (string) Checked resource ID
(define (check-resource-id resource-id type)
(when (not (string? resource-id))
(raise {"message" (str "Resource ID string expected. Found "
resource-id " instead.")
"code" 101}))
(when (not (= (resource-type resource-id) type))
(raise {"message" (str "Failed to find a correct " type " ID.")
"code" 102}))
resource-id)
;; check-integer
;;
;; Validates that the argument is an integer. Raises an error otherwise.
;;
;; Inputs:
;; value: (number) Integer to be checked
;; minimum: (number) Minimum value (false if not set)
;; maximum: (number) Maximum value (false if not set)
;;
;; Output: (number) Checked integer
(define (check-integer value minimum maximum)
(when (not (integer? value))
(raise {"message" (str "Integer value expected. Found " value " instead.")
"code" 103}))
(when (and minimum (< value minimum))
(raise {"message" (str "Minimum accepted value is " minimum ". " value
" found.")
"code" 104}))
(when (and maximum (> value maximum))
(raise {"message" (str "Maximum accepted value is " maximum ". " value
" found.")
"code" 105}))
value)
;; check-k-folds-rows
;;
;; Validates that the number of rows in a dataset is at least twice the
;; number of k-folds. Raises error otherwise.
;;
;; Inputs:
;; k-folds: (number) Integer to be checked
;; dataset: (map) Dataset info
;;
(define (check-k-folds-rows k-folds dataset)
(when (> k-folds (/ (dataset "rows" 0) 2))
(raise {"message" (str "The dataset has too few rows to be split in "
k-folds
" parts.")
"code" 107})))
;; choosable-objective-ids
;;
;; List of IDs of the fields in the dataset that can be chosen as objective
;; field.
;;
;; Inputs:
;; fields: (map) Fields structure
;; Output: (list) list of field IDs
(define (choosable-objective-ids fields)
(let (field-val (lambda (fid k) (fields [fid k] false))
objective-types ["categorical", "numeric"]
pref? (lambda (k) (field-val k "preferred"))
pred? (lambda (k) (member? (field-val k "optype") objective-types)))
(filter (lambda (x) (and (pref? x) (pred? x))) (keys fields))))
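;; Illustrative example (hypothetical field IDs): given a fields map like
;; {"000000" {"optype" "numeric" "preferred" true}
;;  "000001" {"optype" "text" "preferred" true}
;;  "000002" {"optype" "categorical" "preferred" false}}
;; only "000000" is selectable, since the other fields are either not
;; preferred or not categorical/numeric.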
;; check-dataset-objective-id
;;
;; Validates that the argument is a valid objective id in the reference
;; dataset.
;;
;; Inputs:
;; objective-id: (string) ID of the objective field
;; dataset: (map) Dataset resource information
;;
;; Output: (string) Checked objective field ID
(define (check-dataset-objective-id objective-id dataset)
(let (fields (dataset "fields" {})
objective-ids (choosable-objective-ids fields))
(when (not (member? objective-id objective-ids))
(raise {"message" (str "Failed to find the objective ID in the dataset"
" choosable fields.")
"code" 106}))))
;; get-objective-name
;;
;; Returns the name of the field used as objective field
;;
;; Inputs:
;; dataset: (map) Dataset resource info
;; objective-id: (string) ID of the objective field
;;
;; Outputs: (string) Name of the objective field
(define (get-objective-name dataset objective-id)
(let (fields (dataset "fields" {}))
(fields [objective-id "name"] false)))
;; get-objective-id
;;
;; Returns the ID of the field used as objective field
;;
;; Inputs:
;; dataset: (map) Dataset resource info
;; objective-name: (string) Name of the objective field
;;
;; Outputs: (string) ID of the objective field
(define (get-objective-id dataset objective-name)
(let (fields (dataset "fields" {})
objective-field (find-field fields objective-name))
(if (not objective-field)
(raise {"message" (str "Failed to find the "
objective-name
" field"
" in this dataset.")
"code" 106})
(objective-field "id" false))))
;; create-k-folds
;;
;; Creates k-fold splits from a dataset
;;
;; Inputs:
;; dataset-id: (string) Dataset ID
;; k-folds: (integer) Number of folds
;;
;; Output: (list) List of dataset IDs
;;
(define (create-k-folds dataset-id k-folds)
(let (k-fold-fn (lambda (x)
(create-dataset {"origin_dataset" dataset-id
"row_offset" x
"row_step" k-folds
"new_fields" [{"name" "k_fold"
"field" (str x)}]}))
dataset-ids (map k-fold-fn (range 0 k-folds)))
(wait* dataset-ids)))
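;; For instance, with k-folds = 3 this creates three datasets: fold 0 holds
;; rows 0, 3, 6, ..., fold 1 holds rows 1, 4, 7, ..., and fold 2 holds rows
;; 2, 5, 8, ... Each dataset gets a "k_fold" field labeling its fold index.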
;; pair-k-folds
;;
;; Builds a list of pairs of hold-out and complementary datasets for all
;; the k-fold dataset IDs.
;;
;; Inputs:
;; dataset-ids: (list) List of the k-fold dataset IDs
;;
;; Output: (list) List of pairs [hold-out dataset, multidataset with the rest]
;;
(define (pair-k-folds dataset-ids)
(map (lambda(x)
[(nth dataset-ids x)
(concat (take x dataset-ids)
(drop (+ x 1) dataset-ids))])
(range 0 (count dataset-ids))))
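;; For example (hypothetical dataset IDs):
;; (pair-k-folds ["ds/1" "ds/2" "ds/3"]) =>
;; [["ds/1" ["ds/2" "ds/3"]]
;;  ["ds/2" ["ds/1" "ds/3"]]
;;  ["ds/3" ["ds/1" "ds/2"]]]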
;; select-map-keys
;;
;; Filters the keys in a map, keeping only the ones that appear in the list.
;;
;; Inputs:
;; map: (map) Key, value maps
;; keys-list: (list) List of keys to be kept in the map
;; Output: (map) filtered map with only the keys in the keys-list
;;
(define (select-map-keys a-map keys-list)
(reduce (lambda (x y) (let (value (a-map y false))
(cond value (assoc x y value) x)))
{}
keys-list))
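;; For example:
;; (select-map-keys {"seed" "bigml" "foo" 42} ["seed" "pruning"]) =>
;; {"seed" "bigml"}
;; Note that keys whose value is false or missing are dropped as well.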
;; create-k-models
;;
;; Creates the models for a set of k-fold datasets
;;
;; Inputs:
;; type: (string) type of model (model or ensemble)
;; multidatasets: (list) List of lists of dataset IDs once a k-fold is
;; excluded
;; objective-name: (string) name of the objective field
;; model-options: (map) Options for the model or ensemble
;;
;; Output: (list) model IDs
;;
(define (create-k-models type multidatasets objective-name model-options)
(let (models (map (lambda (x)
(create type
(merge {"datasets" x
"objective_field" objective-name}
model-options)))
multidatasets))
(wait* models)))
;; end of k-fold cross-validation code
;; create-k-bp-datasets
;;
;; Creates the models/ensembles and the batch predictions' output datasets
;; for a set of k-fold datasets
;;
;; Inputs:
;; dataset-ids: (list) List of the k-fold dataset IDs
;; objective-name: (string) Objective field name
;; dataset-name: (string) Name of the origin dataset
;; model-options: (map) Options used to build the models/ensembles
;; batch-prediction-options: (map) Options used to build batch predictions
;;
;; Output: (list) List of dataset IDs
;;
(define (create-k-bp-datasets dataset-ids
objective-name
dataset-name
model-options
batch-prediction-options)
(let (number-of-models (model-options "number_of_models" 1)
k-fold-pairs (pair-k-folds dataset-ids)
options (if (> number-of-models 1)
(select-map-keys model-options ENSEMBLE_OPTIONS)
(select-map-keys model-options MODEL_OPTIONS))
type (if (> number-of-models 1) "ensemble" "model")
multidatasets (map last k-fold-pairs)
batch-predictions-options (select-map-keys batch-prediction-options
BATCH_PREDICTION_OPTIONS)
models (create-k-models type
multidatasets
objective-name
options)
batch-predictions (iterate (es []
id dataset-ids
mid models
idx (range 1 (+ 1 (count dataset-ids))))
(let (name (str idx
"-fold batch-prediction "
dataset-name)
opts (assoc batch-predictions-options
"name" name
"all_fields" true
"output_dataset" true
"prediction_name" "__prediction__"
"confidence" true
"confidence_name" "__confidence__"
"tags" ["script_garbage"]))
(append es (create-batchprediction id
mid
opts))))
batch-predictions (wait* batch-predictions))
(wait* (for (bp-id batch-predictions)
(let (bp (fetch bp-id))
(bp "output_dataset_resource"))))))
;; confidence-eval-weight
;;
;; Adds a weight field by using the following formula:
;; - when the prediction is correct, the confidence is multiplied by the
;; inverse frequency of the class (total number of instances in the
;; dataset over the number of instances of the class)
;; - when the prediction is not correct, the inverse of the confidence is
;; multiplied by the frequency of the class
;;
;; Inputs:
;; dataset-id: (string) ID of the training dataset
;; objective-id: (string) Objective field ID
;; ds-ids: (list) List of the dataset IDs generated by the batch predictions
;;
;; Output: (list) List of dataset IDs
(define (confidence-eval-weight dataset-id objective-id ds-ids)
(let (dataset (fetch dataset-id)
distr (dataset ["fields" objective-id "summary" "categories"])
total (apply + (for (item distr) (item 1)))
class-inst (for (item distr) (flatline " (list @{{item}})"))
class-inst (flatline "(real (head (tail (head (filter "
"(= (f {{objective-id}}) (nth _ 0)) "
"(list @{class-inst}))))))")
weight (flatline "(if (= (f {{objective-id}}) (f \"__prediction__\")) "
"(/ (* (f \"__confidence__\") {total}) "
"{class-inst}) "
"(* (/ 1 (* (f \"__confidence__\") {total})) "
"{class-inst}))"))
(for (ds-id ds-ids)
(create-dataset ds-id
{"new_fields" [{"field" weight
"name" "weight"}]
"tags" ["script_garbage"]}))))
;;k-fold-bp-w-model
;;
;;Creates the weighted model or ensemble from the original dataset
;;by doing a k-fold cross-validation and generating batch predictions
;;for every part of the dataset. The batch prediction results are used
;;to generate a weight per instance that is then used as the final
;;model's weight field.
;;
;; Inputs:
;; dataset-id: (string) ID of the training dataset
;; k-folds: (integer) Number of parts to use in the cross-validation
;; objective-name: (string) Objective field name
;; model-options: (map) Options used to build the models/ensembles
;; batch-prediction-options: (map) Options used to build batch predictions
;;
;; Output: (string) ID of the weighted model or ensemble
(define (k-fold-bp-w-model dataset-id
k-folds
objective-name
model-options
batch-prediction-options)
(check-resource-id dataset-id "dataset")
(check-integer k-folds 2 false)
(let (dataset (fetch dataset-id)
dataset-name (dataset "name" false)
objective-id (get-objective-id dataset objective-name))
(check-dataset-objective-id objective-id dataset)
(check-k-folds-rows k-folds dataset)
(let (k-fold-datasets (create-k-folds dataset-id k-folds)
ds-ids (create-k-bp-datasets k-fold-datasets
objective-name
dataset-name
model-options
batch-prediction-options)
ds-ids (confidence-eval-weight dataset-id objective-id ds-ids)
weighted-ds (create-dataset {"origin_datasets" ds-ids}))
(if (> (model-options "number_of_models" 1) 1)
(create-ensemble {"dataset" weighted-ds
"weight_field" "weight"
"objective_field" objective-name
"excluded_fields" ["k_fold"
"__prediction__"
"__confidence__"]})
(create-model {"dataset" weighted-ds
"weight_field" "weight"
"objective_field" objective-name
"excluded_fields" ["k_fold"
"__prediction__"
"__confidence__"]}))))
;;evaluate-weighted
;;
;;Main procedure that:
;; - splits the original data into training and test datasets
;; - creates a default model and evaluates it to use as a reference
;; - creates the weights to be associated to each instance in the training
;; dataset and generates a model using this weight field
;; - evaluates this weighted model
;; - builds a map with the basic evaluation measures
;;
;; Inputs:
;; dataset-id: (string) ID of the training dataset
;; k-folds: (integer) Number of parts the training dataset is divided into
;; for the cross-validation batch predictions
;; objective-name: (string) Objective field name
;; model-options: (map) Options used to build the models/ensembles
;; batch-prediction-options: (map) Options used to build batch predictions
;;
;; Output: (map) Basic evaluation metrics for the reference model and
;; the weighted one
(define (evaluate-weighted dataset-id
k-folds
objective-name
model-options
batch-prediction-options)
(let (seed (model-options "seed" "bigml")
[ds-train ds-test] (create-dataset-split dataset-id 0.8 seed)
model (create-model ds-train model-options)
eval-id (create-and-wait-evaluation model
ds-test)
weighted-model (k-fold-bp-w-model ds-train
k-folds
objective-name
model-options
batch-prediction-options)
weighted-eval-id (create-and-wait-evaluation weighted-model
ds-test)
weighted-eval (fetch weighted-eval-id)
eval (fetch eval-id))
{"phi" (eval ["result" "model" "average_phi"])
"accuracy" (eval ["result" "model" "accuracy"])
"precision" (eval ["result" "model" "average_precision"])
"recall" (eval ["result" "model" "average_recall"])
"evaluation" eval-id
"weighted phi" (weighted-eval ["result" "model" "average_phi"])
"weighted accuracy" (weighted-eval ["result" "model" "accuracy"])
"weighted precision" (weighted-eval ["result" "model" "average_precision"])
"weighted recall" (weighted-eval ["result" "model" "average_recall"])
"weighted evaluation" weighted-eval-id}))
;;output-eval
;;output variable for the script. Contains the result of the evaluate-weighted
;;procedure
(define output-eval (evaluate-weighted dataset-id
k-folds
objective-name
model-options
batch-prediction-options))