Skip to content

Instantly share code, notes, and snippets.

@aficionado
Created October 9, 2016 17:02
Show Gist options
  • Save aficionado/7582bb59bc7050a348f4ae32798840a9 to your computer and use it in GitHub Desktop.
Save aficionado/7582bb59bc7050a348f4ae32798840a9 to your computer and use it in GitHub Desktop.
{
"name": "Tree optimization",
"description": "Script for tree optimization using SMACdown",
"kind": "script",
"source_code": "script.whizzml",
"inputs":[
{
"name": "dataset-id",
"type": "dataset-id",
"description": "Dataset for which we are seeking an optimal tree"
},
{
"name": "objective-id",
"type": "string",
"default": "default",
"description": "The tree's objective field, or 'default' to use the dataset's default"
},
{
"name": "metric",
"type": "string",
"default": "average_phi",
"description": "Evaluation metric that we want to optimize: one of average_recall, average_phi, accuracy, average_precision, or average_f_measure."
},
{
"name": "delete-resources",
"type": "boolean",
"default": false,
"description": "Whether to delete all intermediate resources"
}
],
"outputs":[
{
"name": "result",
"type": "list"
}
]
}
;; Custom generator of candidate parameter maps for BigML trees.
;;
;; `objective-type` is the optype of the objective field. Returns a
;; zero-argument procedure; each call yields a fresh map with:
;;   "stat_pruning"      - a random boolean
;;   "balance_objective" - always false when `objective-type` is
;;                         "numeric", otherwise a random boolean
;;   "node_threshold"    - a rounded random value drawn from
;;                         (rand-range 4 max-nodes)
(define (smacdown-model-params-generator objective-type)
  (lambda ()
    (let (max-nodes 1999
          regression (= "numeric" objective-type))
      {"stat_pruning" (>= (rand) 0.5)
       ;; `and` short-circuits, so no random number is consumed for
       ;; regression objectives.
       "balance_objective" (and (not regression) (>= (rand) 0.5))
       "node_threshold" (round (rand-range 4 max-nodes))})))
;; Builds the evaluation procedure handed to the optimizer.
;;
;; Takes a training dataset `train`, a test dataset `test`, an
;; objective field id `obj`, the name of the evaluation `metric` to
;; optimize and a base `name` for the created models. Returns a
;; procedure of (params itr), where `params` is a list of candidate
;; parameter maps and `itr` is used only to label the created models.
;;
;; For every candidate a model is trained on `train` and evaluated
;; against `test`. The value of `metric` is read from each
;; evaluation's ["result" "model" metric] path and returned as
;; 1 - metric, because the algorithm seeks to *minimize* a value and
;; we want to *maximize* the metric.
;;
;; Raises an error with code -30 when the evaluation holds no numeric
;; value for `metric`.
(define (smacdown-evaluator train test obj metric name)
  (lambda (params itr)
    (log-info "Evaluating " (count params) " candidates...")
    (let (train-params {"dataset" train
                        "objective_field" obj
                        "seed" "SMACdown"
                        "name" (str name " smacdown itr " itr " test model")}
          mod-fn (lambda (p) (merge p train-params))
          eval-fn (lambda (m) {"model" m "dataset" test})
          mod-ids (create* "model" (map mod-fn params))
          eval-ids (create* "evaluation" (map eval-fn mod-ids))
          loss (lambda (ev)
                 (let (metric-value (ev ["result" "model" metric] false))
                   (if (not (number? metric-value))
                     (raise {"message" (str metric " is not a valid metric!")
                             "code" -30})
                     (- 1 metric-value))))
          ;; Wait for and fetch every evaluation *before* reporting
          ;; completion, so the log line reflects finished work.
          losses (map (lambda (eid) (loss (fetch (wait eid)))) eval-ids))
      (log-info "Evaluation complete.")
      losses)))
;; Find optimal parameters using SMACdown.
;;
;; Creates a training dataset from `train-params` and a test dataset
;; from the same parameters with "out_of_bag" set to true, waits for
;; both, and runs smacdown-optimize over them. Each entry of the
;; optimizer's output has its smacdown--actual key replaced by a
;; `metric` key holding 1 minus the stored value.
;;
;; NOTE(review): `metric` is not a parameter of this procedure; it is
;; resolved from the enclosing scope (presumably the script's `metric`
;; input) -- confirm before reusing this procedure elsewhere.
(define (find-optimal-parameters train-params objective-id objective-type)
  (let (holdout-params (assoc train-params "out_of_bag" true)
        train-ds (create-dataset train-params)
        test-ds (create-dataset holdout-params)
        _ (wait* [train-ds test-ds])
        evaluator (smacdown-evaluator train-ds
                                      test-ds
                                      objective-id
                                      metric
                                      "smacdown-model")
        candidate-gen (smacdown-model-params-generator objective-type)
        ranked (smacdown-optimize candidate-gen evaluator "smacdown-model"))
    (map (lambda (entry)
           (let (loss (entry smacdown--actual))
             (assoc (dissoc entry smacdown--actual)
                    metric (- 1 loss))))
         ranked)))
;; Best-effort deletion: tries to delete the resource `id` and, if the
;; deletion raises, logs an informational message instead of
;; propagating the error.
(define (safe-delete id)
  (try
    (delete id)
    (catch err
      (log-info (str "Error deleting resource " id " ignored")))))
;; Take a dataset, sample it into a training set and its out-of-bag
;; test set (rate 0.8, no replacement, fixed seed), and search for the
;; optimal model parameters. Returns the list of parameters ranked by
;; objective; the first entry is extended with:
;;   "full_model" - a model trained on the whole dataset
;;   "model"      - a model trained on the training sample
;;   "evaluation" - the evaluation of "model" on the test sample
;; all built with the first entry's "parameters".
;;
;; `objective-id` may be "default", in which case the dataset's own
;; objective field is used. Raises when the objective field has no
;; optype in the dataset's fields.
;;
;; When `delete-resources` (read from the enclosing scope) is truthy,
;; everything reported by (created-resources) is deleted after the
;; search, which is why the train/test samples are created again
;; afterwards.
;;
;; NOTE(review): the `metric` parameter is not referenced in this body.
(define (optimize-model dataset-id objective-id metric)
  (let (sampling {"origin_dataset" dataset-id
                  "sample_rate" 0.8
                  "replacement" false
                  "seed" "SMACdown"}
        holdout (assoc sampling "out_of_bag" true)
        target-id (if (= objective-id "default")
                    (dataset-get-objective-id dataset-id)
                    objective-id)
        target-type (or ((fetch dataset-id) ["fields" target-id "optype"] false)
                        (raise {"message" "Invalid objective field"}))
        ranked (find-optimal-parameters sampling target-id target-type)
        _ (log-info "SMACdown search complete")
        ;; Clean-up must happen before the final resources are created,
        ;; otherwise they would be deleted as well.
        _ (when delete-resources
            (log-info "Deleting intermediate resources...")
            (map safe-delete (created-resources)))
        _ (log-info "Training model on full dataset...")
        winner (head ranked)
        model-args (merge (winner "parameters" {})
                          {"objective_field" target-id "seed" "SMACdown"})
        full-model (create-model dataset-id model-args)
        train-ds (create-dataset sampling)
        test-ds (create-dataset holdout)
        holdout-model (create-model train-ds model-args)
        holdout-eval (create-evaluation holdout-model test-ds))
    (wait* [holdout-eval full-model])
    (cons (assoc winner
                 "full_model" full-model
                 "model" holdout-model
                 "evaluation" holdout-eval)
          (tail ranked))))
;; Script entry point: binds `result` to the ranked parameter list
;; returned by optimize-model for the dataset-id, objective-id and
;; metric values in scope.
(define result (optimize-model dataset-id objective-id metric))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment