Last active
March 18, 2017 20:35
-
-
Save petersen-poul/a855b5e4b94d6bdb15e6ad4b1de10236 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"name": "Minimum Scale for Cluster Class Purity", | |
"description": "Given a dataset and a categorical field, finds the minimum scale required to create class purity in the cluster with k = number of classes.", | |
"inputs": [ | |
{ | |
"name": "dataset", | |
"type": "dataset-id", | |
"description": "Dataset to analyze." | |
}, | |
{ | |
"name": "fields", | |
"type": "list", | |
"description": "A list of field names or ids to analyze for minimum scale class purity. If emtpy, all categorical values will be analyzed.", | |
"default": [] | |
}, | |
{ | |
"name": "min-scale", | |
"type": "number", | |
"description": "Specifies the smallest scale value to check.", | |
"default": 0 | |
}, | |
{ | |
"name": "max-scale", | |
"type": "number", | |
"description": "Specifies the largest scale value to check.", | |
"default": 100 | |
}, | |
{ | |
"name": "min-accuracy", | |
"type": "number", | |
"description": "Each loop halves the range of possible scales. When the range is smaller than the accuracy, the search will stop.", | |
"default": 0.01 | |
}, | |
{ | |
"name": "max-loop", | |
"type": "number", | |
"description": "Stops after a fixed number of loops of searching for each categorical value.", | |
"default": 25 | |
} | |
], | |
"outputs": [ | |
{ | |
"name": "min-scales", | |
"type": "list", | |
"description": "A list containing the minimum scales, along with other details, for each categorical value tested for class purity." | |
} | |
] | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
; | |
; Given a dataset map or resource id, returns a list of | |
; field ids for all categorical fields in input_fields | |
; | |
(define (all-categoricals dataset) | |
(let | |
(dataset-rec (if (map? dataset) dataset (fetch dataset))) | |
(loop | |
(ids (get dataset-rec "input_fields") categoricals []) | |
(if (= ids []) | |
categoricals | |
(let | |
(id (head ids)) | |
(if (= (get-in dataset-rec [ "fields" id "optype" ]) "categorical") | |
(recur (tail ids) (append categoricals id)) | |
(recur (tail ids) categoricals) | |
) | |
) | |
) | |
) | |
) | |
) | |
; | |
; Given a dataset and a field identifier returns | |
; a list of categoricals that match either the field name | |
; or id. Normally this is should be one item, but if a field | |
; happens to have a name that looks like a field id (ex: "000001") | |
; then there could be multiple matches. | |
; | |
(define (find-categorical dataset identifier) | |
(let | |
(dataset-rec (if (map? dataset) dataset (fetch dataset)) | |
field_ids (get dataset-rec "input_fields")) | |
(loop (ids field_ids result []) | |
(if (= ids []) | |
result | |
(let | |
(id (head ids) | |
field-rec (get-in dataset-rec ["fields" id])) | |
(if (and (or (= id identifier) (= (get field-rec "name") identifier)) (= (get field-rec "optype") "categorical")) | |
(recur (tail ids) (append result id)) | |
(recur (tail ids) result) | |
) | |
) | |
) | |
) | |
) | |
) | |
; | |
; Given a dataset and a list of identifiers, possibly | |
; empty, returns a list of all categoricals that match | |
; either by name or id. | |
; | |
(define (find-categoricals dataset identifiers) | |
(let | |
(dataset-rec (if (map? dataset) dataset (fetch dataset))) | |
(if (or (= identifiers []) (empty? identifiers)) | |
(all-categoricals dataset) | |
(loop (ids identifiers categoricals []) | |
(if (= ids []) | |
categoricals | |
(recur (tail ids) (concat categoricals (find-categorical dataset (head ids)))) | |
) | |
) | |
) | |
) | |
) | |
; | |
; Given a dataset and the id of a categorical value | |
; returns the number of instances in the dataset | |
; with the specified class. | |
; | |
(define (class-count dataset category-id class) | |
(let | |
(dataset-rec (if (map? dataset) dataset (fetch dataset)) | |
categories (get-in dataset-rec ["fields" category-id "summary" "categories"])) | |
(loop | |
(to-check categories) | |
(if (= to-check []) | |
0 | |
(let | |
(category (head to-check)) | |
(if (= (nth category 0) class) | |
(nth category 1) | |
(recur (tail to-check)) | |
) | |
) | |
) | |
) | |
) | |
) | |
; | |
; Given a cluster, computes the class purity of the specified | |
; categorical field between cluster groups. | |
; | |
; For example, if the categorical value is TRUE/FALSE then | |
; we loop over the cluster groups and for each check the centroid | |
; say it is TRUE, then count the number of instance in that | |
; custer group that are TRUE. | |
; | |
(define (class-purity cluster category-id) | |
(let | |
(cluster-rec (if (map? cluster) cluster (fetch cluster))) | |
(loop | |
(clusters (get-in cluster-rec ["clusters" "clusters"]) total-correct 0 total 0) | |
(if (= clusters []) | |
(/ total-correct total) | |
(let | |
(rec (head clusters) | |
cluster-dataset (create-and-wait-dataset { | |
"cluster" cluster | |
"centroid" (get rec "id")}) | |
cluster-class (get-in rec ["center" category-id]) | |
num-correct (class-count cluster-dataset category-id cluster-class)) | |
(recur (tail clusters) (+ total-correct num-correct) (+ total (get rec "count"))) | |
) | |
) | |
) | |
) | |
) | |
; | |
; Given a dataset, categorical id and a scale, creates | |
; a cluster with k = number of classes of the categorical | |
; field and with the appropriate scale. We then return the | |
; class purity of the cluster. | |
; | |
(define (purity-at-scale dataset category-id scale) | |
(let | |
(dataset-rec (if (map? dataset) dataset (fetch dataset)) | |
num-classes (count (get-in dataset-rec [ "fields" category-id "summary" "categories" ])) | |
field-scales (assoc {} category-id scale) | |
cluster (create-and-wait-cluster { | |
"dataset" dataset | |
"k" num-classes | |
"field_scales" field-scales})) | |
(log-info "Cluster: " cluster) | |
(class-purity cluster category-id) | |
) | |
) | |
; | |
; Uses a binary search between max/min possible values of scale | |
; to find the minimum scale required to ensure class purity in the cluster | |
; for the specified categorical field. | |
; | |
(define (purity-min-scale dataset category-id min-scale max-scale min-accuracy max-loop) | |
(log-info "Analyzing category-id " category-id) | |
(loop | |
(lower min-scale upper max-scale loop-count 0) | |
(log-info "Loop " loop-count " upper-scale=" upper " lower-scale=" lower) | |
(let | |
( diff (- upper lower) | |
mid-point (+ lower (/ diff 2)) | |
purity (purity-at-scale dataset category-id mid-point)) | |
(log-info "Scale " mid-point " Purity " purity) | |
(if (or (<= (/ diff 2) min-accuracy) (> loop-count max-loop)) | |
{"dataset" dataset | |
"category-id" category-id | |
"category-name" (get-in (fetch dataset) [ "fields" category-id "name" ]) | |
"min-scale" (if (= (floor purity) 1) mid-point upper) | |
"loop-count" loop-count | |
"stop-criteria" (if (> loop-count max-loop) "max-loop" "min-accuracy") | |
"accuracy" (/ diff 2)} | |
(if (= (floor purity) 1) | |
(recur lower mid-point (+ loop-count 1)) | |
(recur mid-point upper (+ loop-count 1)) | |
) | |
) | |
) | |
) | |
) | |
; | |
; Loops over the specified categorical fields and finds the minimum | |
; scale for class purity for each. | |
; | |
(define (purity-min-scales dataset fields min-scale max-scale min-accuracy max-loop) | |
(let | |
(categoricals | |
(if (or (empty? fields) (= fields [])) | |
(all-categoricals dataset) | |
(find-categoricals dataset fields) | |
) | |
) | |
(loop | |
(ids categoricals results []) | |
(if (= ids []) | |
results | |
(let | |
(id (head ids)) | |
(recur (tail ids) (append results (purity-min-scale dataset id min-scale max-scale min-accuracy max-loop))) | |
) | |
) | |
) | |
) | |
) | |
(define min-scales (purity-min-scales dataset fields min-scale max-scale min-accuracy max-loop)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment