-
-
Save mmerce/cd87dc119bfbf6dcc4ef0c7d9be0bf1d to your computer and use it in GitHub Desktop.
Script to weight using the cross-validation batch predictions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"name": "Comparing model to batch-prediction-weighted model", | |
"description": "Comparing evaluation of the usual model with a model built using a weight field. The field is defined by using k-fold batch predictions to take into account which instances are predicted correctly and the confidence of this prediction. It also balances the imbalanced classes.", | |
"inputs": [ | |
{ | |
"name": "dataset-id", | |
"type": "dataset-id", | |
"description": "Select the dataset" | |
}, | |
{ | |
"name": "k-folds", | |
"type": "number", | |
"description": "number of folds for batch predictions", | |
"default": 5 | |
}, | |
{ | |
"name": "objective-name", | |
"type": "string", | |
"description": "Name of the field to be predicted", | |
"default": "" | |
}, | |
{ | |
"name": "model-options", | |
"type": "map", | |
"description": "Settings used to create the model", | |
"default": {} | |
}, | |
{ | |
"name": "batch-prediction-options", | |
"type": "map", | |
"description": "Settings used to create the batch predictions", | |
"default": {} | |
} | |
], | |
"outputs": [ | |
{ | |
"name": "output-eval", | |
"type": "map", | |
"description": "Evaluation measures for the reference model (no batch predictions used) and the weighted one." | |
} | |
] | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;;Balancing with cost function example | |
;; | |
;;Builds a model for imbalanced datasets assigning a weight to each instance | |
;;to balance the classes and selectively give importance | |
;;to the instances that result in the model's better performance. | |
;; | |
;;The evaluation to know whether an instance will be weighted more or less | |
;;is done in a k-fold cross-validation way, but instead of aggregating the | |
;;results, each prediction is compared to the real value per instance and | |
;;an associated weight is defined depending on the result of this | |
;;comparison, like a cost function. | |
;; | |
;; Inputs: | |
;; dataset-id: (string) Dataset ID that contains the imbalanced data | |
;; k-folds: (integer) Number of parts that the evaluated intermediate models | |
;; will be divided into | |
;; objective-name: (string) Name of the field to be predicted
;; model-options: (map) Attributes that will be used in the models creation | |
;; calls | |
;; batch-prediction-options: (map) Attributes that will be used in the | |
;; batch prediction creation calls | |
;; | |
;; k-fold cross-validation code | |
;; This code will eventually be defined as a library. | |
(define MODEL_OPTIONS
  ;; Keys kept from the user-supplied model options when a single model
  ;; is built (see the select-map-keys call in create-k-bp-datasets).
  ["balance_objective" "missing_splits" "pruning" "weight_field"
   "objective_weights" "node_threshold" "seed"])
(define ENSEMBLE_OPTIONS
  ;; Every key accepted for a single model plus the ensemble-only ones;
  ;; used to filter the model options when an ensemble is built.
  (concat MODEL_OPTIONS
          ["sample_rate" "replacement" "randomize" "number_of_models"]))
(define BATCH_PREDICTION_OPTIONS
  ;; Keys that may be kept from the user-supplied batch prediction options.
  ["sample_rate" "out_of_bag" "range" "replacement"
   "ordering" "seed" "missing_strategy" "combiner"])
;; batch datasets using k-folds | |
;; | |
;; creates k-fold batch datasets for a dataset | |
;; Inputs: | |
;; dataset-id: (string) Dataset ID | |
;; k-folds: (integer) Number of folds | |
;; model-options: (map) Options to use in model/ensemble | |
;; batch-prediction-options: (map) Options to use in batch predictions | |
;; creation | |
;; | |
;; Output: (list) batch predicted datasets | |
;; | |
;; Raises: | |
;; 101: The dataset-id argument is not a string | |
;; 102: The dataset-id is not a valid dataset ID | |
;; 103: The k-folds argument is not an integer | |
;; 104: The k-folds argument is not >= 2 | |
;; 105: The k-folds argument is higher than the maximum
;; 106: The objective field ID is not in the selectable IDs list | |
;; 107: The k-folds argument is too high compared to the number of rows | |
;; | |
;; check-resource-id | |
;; | |
;; Validates that the argument is a resource ID and its type. Raises an error | |
;; if otherwise. | |
;; | |
;; Inputs: | |
;; resource-id: (string) Resource ID | |
;; type: (string) Type of resource | |
;; | |
;; Output: (string) Checked resource ID | |
(define (check-resource-id resource-id type)
  ;; Guard clauses in order: a non-string argument raises code 101, a
  ;; string whose resource type differs from `type` raises code 102.
  ;; When both checks pass the ID is handed back unchanged.
  (cond (not (string? resource-id))
        (raise {"message" (str "Resource ID string expected. Found "
                               resource-id " instead.")
                "code" 101})
        (not (= (resource-type resource-id) type))
        (raise {"message" (str "Failed to find a correct " type " ID.")
                "code" 102})
        resource-id))
;; check-integer | |
;; | |
;; Validates that the argument is an integer. Raises error otherwise. | |
;; | |
;; Inputs: | |
;; value: (number) Integer to be checked | |
;; minimum: (number) Minimum value (false if not set) | |
;; maximum: (number) Maximum value (false if not set) | |
;; | |
;; Output: (number) Checked integer | |
(define (check-integer value minimum maximum)
  ;; Checks run in a fixed order: type (code 103), lower bound (code 104),
  ;; upper bound (code 105). A bound passed as false is skipped because the
  ;; `and` short-circuits on it. The accepted value is returned as is.
  (cond (not (integer? value))
        (raise {"message" (str "Integer value expected. Found " value " instead.")
                "code" 103})
        (and minimum (< value minimum))
        (raise {"message" (str "Minimum accepted value is " minimum ". " value
                               " found.")
                "code" 104})
        (and maximum (> value maximum))
        (raise {"message" (str "Maximum accepted value is " maximum ". " value
                               " found.")
                "code" 105})
        value))
;; check-k-folds-rows | |
;; | |
;; Validates that the number of rows in a dataset is at least twice the | |
;; number of k-folds. Raises error otherwise. | |
;; | |
;; Inputs: | |
;; k-folds: (number) Integer to be checked | |
;; dataset: (map) Dataset info | |
;; | |
(define (check-k-folds-rows k-folds dataset)
  ;; The largest accepted number of folds is half the dataset's "rows"
  ;; count (taken as 0 when the key is absent); anything above that raises
  ;; code 107.
  (let (rows (dataset "rows" 0)
        max-folds (/ rows 2))
    (when (> k-folds max-folds)
      (raise {"message" (str "The dataset has too few rows to be split in "
                             k-folds
                             " parts.")
              "code" 107}))))
;; choosable-objective-ids | |
;; | |
;; List of IDs of the fields in the dataset that can be chosen as objective | |
;; field. | |
;; | |
;; Inputs: | |
;; fields: (map) Fields structure | |
;; Output: (list) list of field IDs | |
(define (choosable-objective-ids fields)
  ;; A field ID is kept when its descriptor has a truthy "preferred" flag
  ;; and its "optype" is one of the listed types; attributes that are
  ;; missing are read as false.
  (let (allowed-optypes ["categorical" "numeric"]
        selectable? (lambda (fid)
                      (and (fields [fid "preferred"] false)
                           (member? (fields [fid "optype"] false)
                                    allowed-optypes))))
    (filter selectable? (keys fields))))
;; check-dataset-objective-id | |
;; | |
;; Validates that the argument is a valid objective id in the reference | |
;; dataset. | |
;; | |
;; Inputs: | |
;; objective-id: (string) ID of the objective field | |
;; dataset: (map) Dataset resource information | |
;; | |
;; Output: (string) Checked objective field ID | |
(define (check-dataset-objective-id objective-id dataset)
  ;; Raises code 106 when `objective-id` is not among the dataset fields
  ;; accepted by choosable-objective-ids. Otherwise returns the checked ID,
  ;; as the header comment promises and as check-resource-id and
  ;; check-integer already do for their arguments.
  (let (fields (dataset "fields" {})
        objective-ids (choosable-objective-ids fields))
    (when (not (member? objective-id objective-ids))
      (raise {"message" (str "Failed to find the objective ID in the dataset"
                             " choosable fields.")
              "code" 106}))
    objective-id))
;; get-objective-name | |
;; | |
;; Returns the name of the field used as objective field | |
;; | |
;; Inputs: | |
;; dataset: (map) Dataset resource info | |
;; objective-id: (string) ID of the objective field | |
;; | |
;; Outputs: (string) Name of the objective field | |
(define (get-objective-name dataset objective-id)
  ;; Single path lookup into the dataset map; evaluates to false when the
  ;; field or its "name" attribute is absent.
  (dataset ["fields" objective-id "name"] false))
;; get-objective-id | |
;; | |
;; Returns the ID of the field used as objective field | |
;; | |
;; Inputs: | |
;; dataset: (map) Dataset resource info | |
;; objective-name: (string) Name of the objective field | |
;; | |
;; Outputs: (string) ID of the objective field | |
(define (get-objective-id dataset objective-name)
  ;; Looks the field up by name with find-field. On success its "id" entry
  ;; is returned (false if the entry is absent); when nothing matches,
  ;; code 106 is raised.
  (let (descriptor (find-field (dataset "fields" {}) objective-name))
    (if descriptor
      (descriptor "id" false)
      (raise {"message" (str "Failed to find the "
                             objective-name
                             " field"
                             " in this dataset.")
              "code" 106}))))
;; create-k-folds | |
;; | |
;; creating k-fold splits from a dataset | |
;; | |
;; Inputs: | |
;; dataset-id: (string) Dataset ID | |
;; k-folds: (integer) Number of folds | |
;; | |
;; Output: (list) List of dataset IDs | |
;; | |
(define (create-k-folds dataset-id k-folds)
  ;; One dataset is requested per fold index in [0, k-folds): the index is
  ;; sent as "row_offset" and k-folds as "row_step", and a "k_fold" field
  ;; holding the index is added. All requests are issued before waiting.
  (wait* (for (fold-index (range 0 k-folds))
           (create-dataset {"origin_dataset" dataset-id
                            "row_offset" fold-index
                            "row_step" k-folds
                            "new_fields" [{"name" "k_fold"
                                           "field" (str fold-index)}]}))))
;; pair-k-folds | |
;; | |
;; Builds a list of pairs of hold-out and complementary datasets for all | |
;; the k-fold dataset IDs. | |
;; | |
;; Inputs: | |
;; dataset-ids: (list) List of the k-fold dataset IDs | |
;; | |
;; Output: (list) List of pairs [hold-out dataset, multidataset with the rest] | |
;; | |
(define (pair-k-folds dataset-ids)
  ;; For every position, pairs the dataset found there with the list of
  ;; all the other datasets, keeping their original order.
  (for (idx (range 0 (count dataset-ids)))
    (let (hold-out (nth dataset-ids idx)
          before (take idx dataset-ids)
          after (drop (+ idx 1) dataset-ids))
      [hold-out (concat before after)])))
;; select-map-keys | |
;; | |
;; Filters the keys in a map, keeping only the ones that appear in the list. | |
;; | |
;; Inputs: | |
;; a-map: (map) Map of key/value pairs to be filtered
;; keys-list: (list) List of keys to be kept in the map | |
;; Output: (map) filtered map with only the keys in the keys-list | |
;; | |
(define (select-map-keys a-map keys-list)
  ;; Builds a new map with the entries of `a-map` whose key is listed in
  ;; `keys-list`. Presence is decided by key membership, not by the value,
  ;; so an option explicitly set to false (e.g. {"replacement" false}) is
  ;; kept instead of being silently dropped.
  (let (present-keys (keys a-map))
    (reduce (lambda (kept key)
              (if (member? key present-keys)
                (assoc kept key (a-map key false))
                kept))
            {}
            keys-list)))
;; create-k-models | |
;; | |
;; Creates the models for a set of k-fold datasets | |
;; | |
;; Inputs: | |
;; type: (string) type of model (model or ensemble) | |
;; multidatasets: (list) List of lists of dataset IDs once a k-fold is
;; excluded | |
;; objective-name: (string) name of the objective field | |
;; model-options: (map) Options for the model or ensemble | |
;; | |
;; Output: (list) model IDs | |
;; | |
(define (create-k-models type multidatasets objective-name model-options)
  ;; Issues one `create` request of the given resource type per list of
  ;; dataset IDs, then waits for all of them. `model-options` is merged
  ;; last, so its entries win over the two arguments set here.
  (wait* (for (training-ids multidatasets)
           (create type
                   (merge {"datasets" training-ids
                           "objective_field" objective-name}
                          model-options)))))
;; end of k-fold cross-validation code | |
;; create-k-bp-datasets | |
;; | |
;; Creates the models/ensembles and batch predictions' datasets | |
;; for a set of k-fold datasets | |
;; | |
;; Inputs: | |
;; dataset-ids: (list) List of the k-fold dataset IDs | |
;; objective-name: (string) Objective field name | |
;; dataset-name: (string) Name of the origin dataset | |
;; model-options: (map) Options used to build the models/ensembles | |
;; batch-prediction-options: (map) Options used to build batch predictions | |
;; | |
;; Output: (list) List of dataset IDs | |
;; | |
(define (create-k-bp-datasets dataset-ids
                              objective-name
                              dataset-name
                              model-options
                              batch-prediction-options)
  ;; For each k-fold dataset, trains a model (or an ensemble when
  ;; "number_of_models" is above 1) on the remaining folds, predicts the
  ;; held-out fold with it and returns the IDs of the datasets generated
  ;; by those batch predictions.
  (let (ensemble? (> (model-options "number_of_models" 1) 1)
        type (if ensemble? "ensemble" "model")
        ;; Only whitelisted creation arguments reach the API.
        options (select-map-keys model-options
                                 (if ensemble? ENSEMBLE_OPTIONS MODEL_OPTIONS))
        multidatasets (map last (pair-k-folds dataset-ids))
        models (create-k-models type multidatasets objective-name options)
        ;; Fix: the filtered map was computed but the unfiltered user map
        ;; was sent; the whitelist is now really applied.
        bp-options (select-map-keys batch-prediction-options
                                    BATCH_PREDICTION_OPTIONS)
        batch-predictions
        (iterate (es []
                  id dataset-ids
                  mid models
                  idx (range 1 (+ 1 (count dataset-ids))))
          (let (name (str idx "-fold batch-prediction " dataset-name)
                ;; These settings are set last so user options cannot
                ;; override the field names read later by the weight formula.
                opts (assoc bp-options
                            "name" name
                            "all_fields" true
                            "output_dataset" true
                            "prediction_name" "__prediction__"
                            "confidence" true
                            "confidence_name" "__confidence__"
                            "tags" ["script_garbage"]))
            (append es (create-batchprediction id mid opts))))
        batch-predictions (wait* batch-predictions))
    (wait* (for (bp-id batch-predictions)
             (let (bp (fetch bp-id))
               (bp "output_dataset_resource"))))))
;; confidence-eval-weight | |
;; | |
;; Adds a weight field by using the following formula:
;;  - when the prediction is correct, the confidence is multiplied by the
;;    inverse frequency of the class (total number of instances in the
;;    dataset over the number of instances of the class)
;;  - when the prediction is not correct, the inverse of the confidence is
;;    multiplied by the frequency of the class (number of instances of the
;;    class over the total number of instances in the dataset)
;; | |
;; Inputs: | |
;; dataset-id: (string) ID of the training dataset | |
;; objective-id: (string) Objective field ID | |
;; ds-ids: (list) List of the dataset IDs generated by the batch predictions | |
;; | |
;; Output: (list) List of dataset IDs | |
(define (confidence-eval-weight dataset-id objective-id ds-ids)
  ;; Builds one Flatline expression for the weight and requests, for each
  ;; dataset in `ds-ids`, a new dataset extended with a "weight" field.
  ;; The names `item`, `class-inst`, `total` and `objective-id` are
  ;; interpolated by the templates below and must not be renamed.
  (let (training-ds (fetch dataset-id)
        categories (training-ds ["fields" objective-id "summary" "categories"])
        ;; Sum of the second element of every category entry.
        total (apply + (map (lambda (entry) (entry 1)) categories))
        ;; One "(list ...)" Flatline snippet per category entry.
        class-inst (for (item categories) (flatline " (list @{{item}})"))
        ;; Expression selecting the second element of the entry whose first
        ;; element equals the row's objective value.
        class-inst (flatline "(real (head (tail (head (filter "
                             "(= (f {{objective-id}}) (nth _ 0)) "
                             "(list @{class-inst}))))))")
        weight (flatline "(if (= (f {{objective-id}}) (f \"__prediction__\")) "
                         "(/ (* (f \"__confidence__\") {total}) "
                         "{class-inst}) "
                         "(* (/ 1 (* (f \"__confidence__\") {total})) "
                         "{class-inst}))")
        weight-field {"field" weight
                      "name" "weight"})
    (map (lambda (ds-id)
           (create-dataset ds-id
                           {"new_fields" [weight-field]
                            "tags" ["script_garbage"]}))
         ds-ids)))
;;k-fold-bp-w-model | |
;; | |
;;Creates the weighted model or ensemble from the original dataset | |
;;by doing a k-fold cross-validation and generating batch predictions | |
;;for every part of the dataset. The batch prediction results are used | |
;;to generate a weight per instance that will be one of the models | |
;;arguments. | |
;; | |
;; Inputs: | |
;; dataset-id: (string) ID of the training dataset | |
;; k-folds: (integer) Number of parts to use in the k-fold cross-validation
;; objective-name: (string) Objective field name
;; model-options: (map) Options used to build the models/ensembles | |
;; batch-prediction-options: (map) Options used to build batch predictions | |
;; | |
;; Output: (string) ID of the weighted model or ensemble
(define (k-fold-bp-w-model dataset-id
                           k-folds
                           objective-name
                           model-options
                           batch-prediction-options)
  ;; Validates the inputs, computes a per-instance weight from k-fold batch
  ;; predictions and returns the ID of a model (or of an ensemble when
  ;; "number_of_models" is above 1) trained with that weight field.
  (check-resource-id dataset-id "dataset")
  (check-integer k-folds 2 false)
  (let (dataset (fetch dataset-id)
        dataset-name (dataset "name" false)
        objective-id (get-objective-id dataset objective-name))
    (check-dataset-objective-id objective-id dataset)
    (check-k-folds-rows k-folds dataset)
    (let (k-fold-datasets (create-k-folds dataset-id k-folds)
          ds-ids (create-k-bp-datasets k-fold-datasets
                                       objective-name
                                       dataset-name
                                       model-options
                                       batch-prediction-options)
          ds-ids (confidence-eval-weight dataset-id objective-id ds-ids)
          weighted-ds (create-dataset {"origin_datasets" ds-ids})
          ;; Helper fields added along the way must not be used as
          ;; predictors.
          final-args {"dataset" weighted-ds
                      "weight_field" "weight"
                      "objective_field" objective-name
                      "excluded_fields" ["k_fold"
                                         "__prediction__"
                                         "__confidence__"]})
      ;; Fix: the comparison used to be written as
      ;; (if (model-options ...) > 1), which made the `if` a no-op, created
      ;; BOTH resources and always returned the single model.
      ;; NOTE(review): the final resource is built without the
      ;; user-supplied model options (including "number_of_models"), as in
      ;; the original code -- confirm this is intended.
      (if (> (model-options "number_of_models" 1) 1)
        (create-ensemble final-args)
        (create-model final-args)))))
;;evaluate-weighted | |
;; | |
;;Main procedure that: | |
;; - splits the original data in training and test datasets | |
;; - creates a default model and evaluates it to use this as reference | |
;; - creates the weights to be associated to each instance in the training | |
;; dataset and generates a model using this weight field | |
;; - evaluates this weighted model | |
;; - builds a map with the basic evaluation measures | |
;; | |
;; Inputs: | |
;; dataset-id: (string) ID of the training dataset | |
;; k-folds: (integer) Number of parts to divide the training dataset to test | |
;; objective-name: (string) Objective field name | |
;; model-options: (map) Options used to build the models/ensembles | |
;; batch-prediction-options: (map) Options used to build batch predictions | |
;; | |
;; Output: (map) Basic evaluation metrics for the reference model and | |
;; the weighted one | |
(define (evaluate-weighted dataset-id
                           k-folds
                           objective-name
                           model-options
                           batch-prediction-options)
  ;; Splits the dataset 80/20, evaluates a reference model and a weighted
  ;; model on the same test split and returns a map with both sets of
  ;; measures and the two evaluation IDs.
  (let (seed (model-options "seed" "bigml")
        [ds-train ds-test] (create-dataset-split dataset-id 0.8 seed)
        ;; Fix: the reference model now gets the same objective field as
        ;; the weighted one. Before, it fell back to the dataset's default
        ;; objective and the two evaluations could refer to different
        ;; fields.
        ;; NOTE(review): model-options is still passed unfiltered here, as
        ;; in the original; ensemble-only keys such as "number_of_models"
        ;; may not be accepted for a single model -- confirm.
        reference-options (assoc model-options
                                 "objective_field" objective-name)
        model (create-model ds-train reference-options)
        eval-id (create-and-wait-evaluation model ds-test)
        weighted-model (k-fold-bp-w-model ds-train
                                          k-folds
                                          objective-name
                                          model-options
                                          batch-prediction-options)
        weighted-eval-id (create-and-wait-evaluation weighted-model ds-test)
        weighted-eval (fetch weighted-eval-id)
        eval (fetch eval-id))
    {"phi" (eval ["result" "model" "average_phi"])
     "accuracy" (eval ["result" "model" "accuracy"])
     "precision" (eval ["result" "model" "average_precision"])
     "recall" (eval ["result" "model" "average_recall"])
     "evaluation" eval-id
     "weighted phi" (weighted-eval ["result" "model" "average_phi"])
     "weighted accuracy" (weighted-eval ["result" "model" "accuracy"])
     "weighted precision" (weighted-eval ["result" "model" "average_precision"])
     "weighted recall" (weighted-eval ["result" "model" "average_recall"])
     "weighted evaluation" weighted-eval-id}))
;;output-eval | |
;;output variable for the script. Contains the result of the evaluate-weighted | |
;;procedure | |
(define output-eval
  ;; Script output: the comparison map built from the five script inputs.
  (evaluate-weighted dataset-id k-folds objective-name
                     model-options batch-prediction-options))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment