Skip to content

Instantly share code, notes, and snippets.

@charleslparker
Last active May 25, 2016 14:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save charleslparker/d12a74d901c48f2ea158631196b9411f to your computer and use it in GitHub Desktop.
Save charleslparker/d12a74d901c48f2ea158631196b9411f to your computer and use it in GitHub Desktop.
Best-first feature selection via WhizzML
;; Translate a list of field ids into field names.
;;
;; dataset-id: id of the dataset whose "fields" map is consulted.
;; ids:        list of field ids to look up.
;; Returns a list with the "name" entry of each id, in the order given.
(define (feature-names dataset-id ids)
  (let (dataset (fetch dataset-id)
        field-map (get dataset "fields")
        name-of (lambda (fid) (get-in field-map [fid "name"])))
    (map name-of ids)))
;; Create one derived dataset per fold index in (range 0 k-folds).
;;
;; Each request uses dataset-id as "origin_dataset", the fold index as
;; "row_offset" and k-folds as "row_step", and adds a "k_fold" field
;; whose "field" value is the fold index rendered as a string.
;; Returns the result of waiting on all the created datasets.
(define (create-k-folds dataset-id k-folds)
  (let (fold-request (lambda (offset)
                       {"origin_dataset" dataset-id
                        "row_offset" offset
                        "row_step" k-folds
                        "new_fields" [{"name" "k_fold"
                                       "field" (str offset)}]})
        fold-ids (map (lambda (offset) (create-dataset (fold-request offset)))
                      (range 0 k-folds)))
    (wait* fold-ids)))
;; Build hold-out pairings from a list of fold datasets.
;;
;; For every position i in dataset-ids, produce a two-element list:
;; the dataset at position i, followed by the list of all the other
;; datasets (those before i, then those after i, order preserved).
(define (pair-k-folds dataset-ids)
  (let (others (lambda (i)
                 (concat (take i dataset-ids)
                         (drop (+ i 1) dataset-ids)))
        pair-at (lambda (i) (list (nth dataset-ids i) (others i))))
    (map pair-at (range 0 (count dataset-ids)))))
;; Compute the default candidate input fields for a dataset: every
;; field id whose "preferred" entry is truthy, excluding the objective
;; field.
;;
;; dataset-id: id of the dataset to inspect.
;; obj-id:     id of the objective field, which is left out.
(define (default-inputs dataset-id obj-id)
  (let (field-map (get (fetch dataset-id) "fields")
        is-input (lambda (fid)
                   (and (get-in field-map [fid "preferred"])
                        (not (= obj-id fid)))))
    (filter is-input (keys field-map))))
;; Create one model per candidate feature and wait for all of them.
;;
;; Every model is built from dataset-id with obj-field as objective;
;; its input fields are the already selected features with a single
;; candidate from potentials consed onto the front.
;; Returns the result of create-and-wait* for the "model" requests,
;; issued in the same order as potentials.
(define (make-models dataset-id obj-field selected potentials)
  (let (with-candidate (lambda (candidate)
                         {"dataset" dataset-id
                          "objective_field" obj-field
                          "input_fields" (cons candidate selected)})
        requests (map with-candidate potentials))
    (create-and-wait* "model" requests)))
;; Pick the best candidate feature by evaluating its model.
;;
;; test-dataset-id: dataset every model is evaluated against.
;; potentials:      candidate feature ids.
;; model-ids:       model ids, positionally matching potentials.
;;
;; One evaluation is created per model; the score read from each is
;; ["result" "model" "average_phi"]. The result is whatever `some`
;; yields for the first candidate whose score equals the maximum.
(define (select-feature test-dataset-id potentials model-ids)
  (let (eval-requests (map (lambda (mid)
                             {"dataset" test-dataset-id "model" mid})
                           model-ids)
        eval-ids (create-and-wait* "evaluation" eval-requests)
        phis (map (lambda (eid)
                    (get-in (fetch eid) ["result" "model" "average_phi"]))
                  eval-ids)
        phi-by-feature (make-map potentials phis)
        best-phi (max phis)
        pick (lambda (fid)
               (if (= best-phi (get phi-by-feature fid)) fid false)))
    (some pick potentials)))
;; Do best-first feature selection. Given a dataset and a target
;; number of features, iteratively construct one model per remaining
;; candidate feature, evaluate them, and add the feature whose model
;; evaluates best to the running set of features. Stop when the target
;; number is reached or there are no candidates left.
;;
;; dataset-id: id of the dataset to select features from.
;; nfeatures:  maximum number of features to select.
;;
;; Returns the names of the selected features. Each newly chosen
;; feature is consed onto the front of the running list, so the most
;; recently selected feature comes first.
(define (select-features dataset-id nfeatures)
  (let (obj-id (dataset-get-objective-id dataset-id)
        input-ids (default-inputs dataset-id obj-id)
        splits (split-dataset dataset-id 0.5)
        train-id (nth splits 0)
        test-id (nth splits 1))
    (loop (selected []
           potentials input-ids)
      (if (or (>= (count selected) nfeatures) (empty? potentials))
        (feature-names dataset-id selected)
        (let (_ (log-info "Making models...")
              ;; Train on the training split only. Building the models
              ;; from the full dataset would let them see the rows of
              ;; the test split they are evaluated on below, which
              ;; biases the comparison between candidate features.
              model-ids (make-models train-id obj-id selected potentials)
              _ (log-info "Selecting feature...")
              next-feat (select-feature test-id potentials model-ids)
              _ (log-info "Selected feature is " next-feat))
          (recur (cons next-feat selected)
                 (filter (lambda (id) (not (= id next-feat)))
                         potentials)))))))
;; Script result: the names of the features chosen by best-first
;; selection. NOTE(review): dataset-id and nfeatures are not defined
;; anywhere in this file -- presumably they are declared as script
;; inputs; confirm against the script's metadata.
(define output-features (select-features dataset-id nfeatures))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment