Skip to content

Instantly share code, notes, and snippets.

@whizzmler
Forked from charleslparker/best-first.whizzml
Created April 21, 2016 01:54
Show Gist options
  • Save whizzmler/dce8ca7b1b1a35baa78c3cea780729a3 to your computer and use it in GitHub Desktop.
Save whizzmler/dce8ca7b1b1a35baa78c3cea780729a3 to your computer and use it in GitHub Desktop.
Best-first feature selection via WhizzML
;; Return the largest element of the list xs.
;; The list is folded with a pairwise "keep the larger" step, seeded
;; with its first element.
(define (get-max xs)
  (let (keep-larger (lambda (best candidate)
                      (if (> best candidate) best candidate)))
    (reduce keep-larger (head xs) xs)))
;; Translate a list of field ids into field names.
;; The names are read from the "fields" map of the fetched dataset,
;; one lookup per id, preserving the order of ids.
(define (feature-names dataset-id ids)
  (let (dataset (fetch dataset-id)
        field-info (get dataset "fields")
        name-of (lambda (fid) (get-in field-info [fid "name"])))
    (map name-of ids)))
;; Create (and wait for) a sampled dataset derived from ds-id.
;;   rate - value sent as "sample_rate"
;;   oob  - value sent as "out_of_bag"
;; The seed is fixed, so every call issues the same "seed" value.
(define (sample-dataset ds-id rate oob)
  (let (request {"origin_dataset" ds-id
                 "sample_rate" rate
                 "out_of_bag" oob
                 "seed" "whizzml-example"})
    (create-and-wait-dataset request)))
;; Split a dataset in two by sampling it twice at the same rate:
;; first with out_of_bag false, then with out_of_bag true.
;; Returns a two-element list: (in-bag-sample out-of-bag-sample).
(define (split-dataset ds-id rate)
  (let (in-bag (sample-dataset ds-id rate false)
        out-of-bag (sample-dataset ds-id rate true))
    (list in-bag out-of-bag)))
;; Compute the default input fields for a dataset: every field id
;; whose "preferred" entry is truthy, excluding the objective field
;; obj-id.
(define (default-inputs dataset-id obj-id)
  (let (all-fields (get (fetch dataset-id) "fields")
        usable-input? (lambda (fid)
                        (and (get-in all-fields [fid "preferred"])
                             (not (= obj-id fid)))))
    (filter usable-input? (keys all-fields))))
;; Build one model per candidate feature.
;; Every model uses the given dataset and objective field; its input
;; fields are the already-selected features plus exactly one id taken
;; from potentials. Returns the result of create-and-wait* for the
;; whole batch of requests, in the same order as potentials.
(define (make-models dataset-id obj-field selected potentials)
  (let (base-request {"dataset" dataset-id "objective_field" obj-field}
        request-with (lambda (candidate)
                       (assoc base-request
                              "input_fields"
                              (cons candidate selected))))
    (create-and-wait* "model" (map request-with potentials))))
;; Pick the best candidate feature.
;; potentials and model-ids are parallel lists: the i-th model was
;; built with the i-th potential feature added. Each model is
;; evaluated against test-dataset-id, its "average_phi" is read from
;; the evaluation result, and the first feature in potentials whose
;; score equals the maximum score is returned.
(define (select-feature test-dataset-id potentials model-ids)
  (let (request-for (lambda (mid) {"dataset" test-dataset-id "model" mid})
        evaluation-ids (create-and-wait* "evaluation"
                                         (map request-for model-ids))
        phi-of (lambda (evaluation-id)
                 (get-in (fetch evaluation-id)
                         ["result" "model" "average_phi"]))
        scores (map phi-of evaluation-ids)
        score-by-feature (make-map potentials scores)
        top-score (get-max scores)
        ;; Yields the feature id when it holds the top score, else false,
        ;; so that `some` returns the first winning id.
        winner-or-false (lambda (fid)
                          (if (= top-score (get score-by-feature fid))
                            fid
                            false)))
    (some winner-or-false potentials)))
;; Do best-first feature selection.
;;
;; Given a dataset and a target number of features, iteratively build
;; one model per remaining candidate feature, evaluate each model, and
;; add the feature with the best evaluation to the running set of
;; selected features. Stops when the target number is reached or when
;; no candidate features remain.
;;
;;   dataset-id - dataset to select features from
;;   nfeatures  - maximum number of features to select
;;
;; Returns the names of the selected features. Features are consed
;; onto the running list, so the most recently selected one is first.
(define (select-features dataset-id nfeatures)
  (let (obj-id (dataset-get-objective-id dataset-id)
        input-ids (default-inputs dataset-id obj-id)
        ;; 50/50 split: in-bag half for training, out-of-bag half for testing.
        splits (split-dataset dataset-id 0.5)
        train-id (nth splits 0)
        test-id (nth splits 1))
    (loop (selected []
           potentials input-ids)
      (if (or (>= (count selected) nfeatures) (empty? potentials))
        (feature-names dataset-id selected)
        (let (_ (log-info "Making models...")
              ;; Train on the training split only. Building the models on
              ;; the full dataset would let them see the test rows and
              ;; bias the evaluations used to rank candidate features.
              model-ids (make-models train-id obj-id selected potentials)
              _ (log-info "Selecting feature...")
              next-feat (select-feature test-id potentials model-ids)
              _ (log-info "Selected feature is " next-feat))
          (recur (cons next-feat selected)
                 (filter (lambda (id) (not (= id next-feat)))
                         potentials)))))))
;; Script result: the names of the selected features.
;; NOTE(review): dataset-id and nfeatures are not defined in this file;
;; presumably they are declared as script inputs -- confirm against the
;; script's metadata.
(define output-features (select-features dataset-id nfeatures))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment