Skip to content

Instantly share code, notes, and snippets.

@charleslparker
Last active June 2, 2016 16:12
Show Gist options
  • Save charleslparker/211cf12b588e2aa1b9e50593536b999d to your computer and use it in GitHub Desktop.
Save charleslparker/211cf12b588e2aa1b9e50593536b999d to your computer and use it in GitHub Desktop.
Custom Feature Analyzer
{
"name": "Custom feature analyzer",
"description": "Find the best features for modeling using a greedy algorithm",
"kind": "script",
"source_code": "analyze-features.whizzml",
"inputs": [
{
"name": "dataset-id",
"type": "dataset-id",
"description": "The data to select features from"
},
{
"name": "balance-objective",
"type": "boolean",
"default": true,
"description": "Set `balance_objective` during model building"
},
{
"name": "number-of-models",
"type": "number",
"default": 10,
"description": "Set `number_of_models` during model building"
},
{
"name": "node-threshold",
"type": "number",
"default": 512,
"description": "Set `node_threshold` during model building"
},
{
"name": "randomize",
"type": "boolean",
"default": true,
"description": "Set `randomize` during model building"
},
{
"name": "staleness",
"type": "number",
"default": 5,
"description": "Stop the algorithm after this many iterations without improvement"
},
{
"name": "pos-class",
"type": "string",
"default": "1",
"description": "Category name for the class considered positive"
},
{
"name": "neg-class",
"type": "string",
"default": "0",
"description": "Category name for the class considered negative"
},
{
"name": "recall-threshold",
"type": "number",
"default": 0.5,
"description": "Threshold of acceptable recall for the positive class"
}
],
"outputs": [
{
"name": "output-features",
"type": "list",
"description": "The list of names of the selected fields"
}
]
}
;; Translate a list of field ids into the matching field names.
;; The names are looked up under "name" in the "fields" map of the
;; fetched dataset; the result keeps the order of `ids`.
(define (feature-names dataset-id ids)
  (let (dataset (fetch dataset-id)
        field-map (get dataset "fields")
        name-of (lambda (field-id) (get-in field-map [field-id "name"])))
    (map name-of ids)))
;; Split a dataset into `k-folds` "folds".
;; For every offset in [0, k-folds) a new dataset is requested from
;; `dataset-id` with that "row_offset", a "row_step" of `k-folds`, and an
;; extra "k_fold" field whose expression is the offset as a string.
;; All requests are issued first; the result of `wait*` on the created
;; dataset ids is returned.
(define (create-k-folds dataset-id k-folds)
  (let (fold-request (lambda (offset)
                       {"origin_dataset" dataset-id
                        "row_offset" offset
                        "row_step" k-folds
                        "new_fields" [{"name" "k_fold"
                                       "field" (str offset)}]})
        make-fold (lambda (offset) (create-dataset (fold-request offset)))
        fold-ids (map make-fold (range 0 k-folds)))
    (wait* fold-ids)))
;; Build one (test-set training-sets) pair per dataset id: the id at
;; each position is "held out" as the test set, and every other id, in
;; the original order, forms the list of training sets.
(define (pair-k-folds dataset-ids)
  (let (hold-out (lambda (idx)
                   (let (before (take idx dataset-ids)
                         after (drop (+ idx 1) dataset-ids))
                     (list (nth dataset-ids idx) (concat before after)))))
    (map hold-out (range 0 (count dataset-ids)))))
;; Score a list of ensemble parameter maps by cross validation.
;;
;; `k-folds` is a list of (test-set training-sets) pairs. For every map
;; in `param-maps` one ensemble is requested per fold (the map merged
;; with that fold's training sets under "datasets"), then one evaluation
;; per ensemble against the test set at the same fold position, and
;; finally the evaluations of each candidate are combined into a single
;; evaluation created from {"evaluations" ...}.
;;
;; `eval-parser` takes a combined evaluation and produces a single value
;; (the quality of the solution). Returns the list of those values, one
;; per entry of `param-maps`, in the same order.
(define (evaluate-candidates param-maps k-folds eval-parser)
  (let (held-out (map (lambda (fold) (nth fold 0)) k-folds)
        training (map (lambda (fold) (nth fold 1)) k-folds)
        _ (log-info "Creating models...")
        ;; One ensemble request per training fold for a given candidate
        ensemble-requests (lambda (params)
                            (map (lambda (train)
                                   (merge params {"datasets" train}))
                                 training))
        ;; All candidates are submitted before any of them is waited on
        pending-ensembles (map (lambda (params)
                                 (create* "ensemble" (ensemble-requests params)))
                               param-maps)
        ensemble-rows (map wait* pending-ensembles)
        _ (log-info "Modeling complete.")
        _ (log-info "Creating evaluations...")
        ;; Pair the i-th ensemble of a candidate with the i-th test set
        evaluation-requests (lambda (row)
                              (map (lambda (idx)
                                     {"model" (nth row idx)
                                      "dataset" (nth held-out idx)})
                                   (range (count row))))
        pending-evaluations (map (lambda (row)
                                   (create* "evaluation"
                                            (evaluation-requests row)))
                                 ensemble-rows)
        evaluation-rows (map wait* pending-evaluations)
        _ (log-info "Evaluation complete.")
        _ (log-info "Combining evaluations...")
        combined (map (lambda (row)
                        (create "evaluation" {"evaluations" row}))
                      evaluation-rows))
    (map eval-parser (wait* combined))))
;; Default candidate input fields for a dataset: the ids of every field
;; whose "preferred" entry is true, leaving out the objective field
;; `obj-id`.
(define (default-inputs dataset-id obj-id)
  (let (field-map (get (fetch dataset-id) "fields")
        preferred? (lambda (fid) (get-in field-map [fid "preferred"]))
        candidate? (lambda (fid)
                     (and (preferred? fid) (not (= obj-id fid)))))
    (filter candidate? (keys field-map))))
;; Read `measure` for the class named `class-name` out of an evaluation.
;; The per-class entries are taken from
;; ["result" "model" "per_class_statistics"]; the first entry whose
;; "class_name" equals `class-name` is used.
;; Raises {"message" ... "code" -200} when no entry matches.
(define (get-measure eval class-name measure)
  (let (stats (get-in eval ["result" "model" "per_class_statistics"])
        matches (filter (lambda (entry)
                          (= (get entry "class_name") class-name))
                        stats))
    (if (empty? matches)
      (raise {"message" (str "Class " class-name " not found!") "code" -200})
      (get (head matches) measure))))
;; Construct a measure function that takes an evaluation id as input and
;; outputs a single value by which to judge the quality of solutions.
;;
;; The returned function fetches the evaluation and reads the positive
;; class' "average_recall". When that recall is not above
;; `recall-threshold` the score is 0. Otherwise, with
;;   pos-pre  = "average_precision" of `pos-class`
;;   neg-miss = 1 - "average_precision" of `neg-class`
;; the score is (pos-pre - neg-miss) / neg-miss.
;;
;; NOTE(review): both measures are looked up inside the per-class
;; statistics entries via get-measure; confirm those keys are present
;; there for the evaluations this script produces.
(define (make-evaluator pos-class neg-class recall-threshold)
  (lambda (eval-id)
    (let (eval (fetch eval-id)
          pos-rec (get-measure eval pos-class "average_recall"))
      (if (> pos-rec recall-threshold)
        (let (pos-pre (get-measure eval pos-class "average_precision")
              neg-pre (get-measure eval neg-class "average_precision")
              neg-miss (- 1 neg-pre)
              ;; A perfectly precise negative class makes neg-miss zero;
              ;; clamp the divisor to a small positive floor so the score
              ;; stays a (large) finite number instead of dividing by zero.
              divisor (if (< neg-miss 0.000001) 0.000001 neg-miss))
          (/ (- pos-pre neg-miss) divisor))
        0))))
;; Expand one node of the search: for every feature in `potentials` that
;; is not already in `selected`, build the feature set obtained by adding
;; it to `selected`, turn it into a model request (`model-req` with
;; "input_fields" set), and score all requests with evaluate-candidates.
;; Returns a list of (feature-set score) pairs.
(define (make-and-eval selected potentials model-req k-folds evaluator)
  (let (already-in? (lambda (fid) (member? fid selected))
        candidates (filter (lambda (fid) (not (already-in? fid))) potentials)
        feature-sets (map (lambda (fid) (cons fid selected)) candidates)
        requests (map (lambda (fset) (assoc model-req "input_fields" fset))
                      feature-sets)
        scores (evaluate-candidates requests k-folds evaluator)
        pair-up (lambda (idx) (list (nth feature-sets idx) (nth scores idx))))
    (map pair-up (range (count scores)))))
;; Order `xs` by the key that `fn` computes for each element, largest
;; key first. Each key is paired with its element's position, the pairs
;; are sorted and reversed, and the elements are read back by position.
(define (sort-by-fn fn xs)
  (let (positions (range (count xs))
        keyed (map (lambda (pos) (list (fn (nth xs pos)) pos)) positions)
        descending (reverse (sort keyed)))
    (map (lambda (entry) (nth xs (nth entry 1))) descending)))
;; Do best-first feature selection.
;;
;; The dataset is split into 5 folds, paired into (test training) sets.
;; The objective id and the candidate input fields are read from the
;; first test fold. The search keeps two lists of (feature-set score)
;; pairs: `to-evaluate` (the frontier) and `evaluated`. On every
;; iteration the highest scoring frontier entry is expanded with
;; make-and-eval, and `stale` counts the iterations whose new entries did
;; not beat the best score seen so far by more than 0.00001.
;;
;; The loop stops when `stale` reaches `staleness`, or when the frontier
;; runs out of entries (this happens with few candidate features, and
;; taking the head of an empty frontier would otherwise fail).
;;
;; Returns the names of the fields in the best scoring feature set, and
;; logs a warning when that best score is not above 0.
(define (select-features dataset-id
                         balance-objective
                         number-of-models
                         node-threshold
                         randomize
                         staleness
                         pos-class
                         neg-class
                         recall-threshold)
  (let (k-folds (pair-k-folds (create-k-folds dataset-id 5))
        first-test (head (head k-folds))
        obj-id (dataset-get-objective-id first-test)
        _ (log-info "Objective: " obj-id)
        potentials (default-inputs first-test obj-id)
        _ (log-info "Features: " potentials)
        evaluator (make-evaluator pos-class neg-class recall-threshold)
        mod-req {"seed" "features"
                 "balance_objective" balance-objective
                 "number_of_models" number-of-models
                 "node_threshold" node-threshold
                 "randomize" randomize}
        ;; Entries are (feature-set score) pairs; rank them by score
        by-score (lambda (x) (nth x 1)))
    (loop (to-evaluate [[[] 0]]
           evaluated []
           stale 0
           last-best 0)
      ;; Keep searching only while there is patience left AND something
      ;; left on the frontier to expand.
      (if (and (< stale staleness) (not (empty? to-evaluate)))
        (let (sorted-evals (sort-by-fn by-score to-evaluate)
              next-eval (head sorted-evals)
              _ (log-info "Evaluating: " (head next-eval))
              rest-evals (tail sorted-evals)
              new (make-and-eval (head next-eval)
                                 potentials
                                 mod-req
                                 k-folds
                                 evaluator)
              next-to-evaluate (concat rest-evals new)
              next-evaluated (cons next-eval evaluated)
              all-evals (concat next-to-evaluate next-evaluated)
              best-eval (head (sort-by-fn by-score all-evals))
              best (max (cons last-best (map by-score new)))
              _ (log-info "Current best score: " best)
              _ (log-info "Current best features: "
                          (feature-names first-test (head best-eval)))
              ;; Logged before the counter is updated for this iteration
              _ (log-info "Iterations without improvement: " stale))
          (recur next-to-evaluate
                 next-evaluated
                 (if (> best (+ last-best 0.00001)) 0 (+ stale 1))
                 best))
        (let (_ (log-info "Getting best...")
              all-evals (concat to-evaluate evaluated)
              sorted-evals (sort-by-fn by-score all-evals)
              best-eval (head sorted-evals)
              best-score (head (tail best-eval))
              _ (log-info "Features: " (head best-eval) " Score: " best-score))
          (when (<= best-score 0)
            (log-warn "Best solution has poor recall!"))
          (feature-names first-test (head best-eval)))))))
;; Script output: run the selection with the script inputs and bind the
;; resulting list to `output-features`.
(define output-features
  (select-features dataset-id balance-objective number-of-models
                   node-threshold randomize staleness
                   pos-class neg-class recall-threshold))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment