Skip to content

Instantly share code, notes, and snippets.

@petersen-poul
Last active March 18, 2017 20:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save petersen-poul/a855b5e4b94d6bdb15e6ad4b1de10236 to your computer and use it in GitHub Desktop.
Save petersen-poul/a855b5e4b94d6bdb15e6ad4b1de10236 to your computer and use it in GitHub Desktop.
{
"name": "Minimum Scale for Cluster Class Purity",
"description": "Given a dataset and a categorical field, finds the minimum scale required to create class purity in the cluster with k = number of classes.",
"inputs": [
{
"name": "dataset",
"type": "dataset-id",
"description": "Dataset to analyze."
},
{
"name": "fields",
"type": "list",
"description": "A list of field names or ids to analyze for minimum scale class purity. If emtpy, all categorical values will be analyzed.",
"default": []
},
{
"name": "min-scale",
"type": "number",
"description": "Specifies the smallest scale value to check.",
"default": 0
},
{
"name": "max-scale",
"type": "number",
"description": "Specifies the largest scale value to check.",
"default": 100
},
{
"name": "min-accuracy",
"type": "number",
"description": "Each loop halves the range of possible scales. When the range is smaller than the accuracy, the search will stop.",
"default": 0.01
},
{
"name": "max-loop",
"type": "number",
"description": "Stops after a fixed number of loops of searching for each categorical value.",
"default": 25
}
],
"outputs": [
{
"name": "min-scales",
"type": "list",
"description": "A list containing the minimum scales, along with other details, for each categorical value tested for class purity."
}
]
}
;
; Given a dataset map or resource id, returns a list of
; field ids for all categorical fields in input_fields
;
(define (all-categoricals dataset)
(let
(dataset-rec (if (map? dataset) dataset (fetch dataset)))
(loop
(ids (get dataset-rec "input_fields") categoricals [])
(if (= ids [])
categoricals
(let
(id (head ids))
(if (= (get-in dataset-rec [ "fields" id "optype" ]) "categorical")
(recur (tail ids) (append categoricals id))
(recur (tail ids) categoricals)
)
)
)
)
)
)
;
; Given a dataset and a field identifier returns
; a list of categoricals that match either the field name
; or id. Normally this is should be one item, but if a field
; happens to have a name that looks like a field id (ex: "000001")
; then there could be multiple matches.
;
(define (find-categorical dataset identifier)
(let
(dataset-rec (if (map? dataset) dataset (fetch dataset))
field_ids (get dataset-rec "input_fields"))
(loop (ids field_ids result [])
(if (= ids [])
result
(let
(id (head ids)
field-rec (get-in dataset-rec ["fields" id]))
(if (and (or (= id identifier) (= (get field-rec "name") identifier)) (= (get field-rec "optype") "categorical"))
(recur (tail ids) (append result id))
(recur (tail ids) result)
)
)
)
)
)
)
;
; Given a dataset and a list of identifiers, possibly
; empty, returns a list of all categoricals that match
; either by name or id.
;
(define (find-categoricals dataset identifiers)
(let
(dataset-rec (if (map? dataset) dataset (fetch dataset)))
(if (or (= identifiers []) (empty? identifiers))
(all-categoricals dataset)
(loop (ids identifiers categoricals [])
(if (= ids [])
categoricals
(recur (tail ids) (concat categoricals (find-categorical dataset (head ids))))
)
)
)
)
)
;
; Given a dataset and the id of a categorical value
; returns the number of instances in the dataset
; with the specified class.
;
(define (class-count dataset category-id class)
(let
(dataset-rec (if (map? dataset) dataset (fetch dataset))
categories (get-in dataset-rec ["fields" category-id "summary" "categories"]))
(loop
(to-check categories)
(if (= to-check [])
0
(let
(category (head to-check))
(if (= (nth category 0) class)
(nth category 1)
(recur (tail to-check))
)
)
)
)
)
)
;
; Given a cluster, computes the class purity of the specified
; categorical field between cluster groups.
;
; For example, if the categorical value is TRUE/FALSE then
; we loop over the cluster groups and for each check the centroid
; say it is TRUE, then count the number of instance in that
; custer group that are TRUE.
;
(define (class-purity cluster category-id)
(let
(cluster-rec (if (map? cluster) cluster (fetch cluster)))
(loop
(clusters (get-in cluster-rec ["clusters" "clusters"]) total-correct 0 total 0)
(if (= clusters [])
(/ total-correct total)
(let
(rec (head clusters)
cluster-dataset (create-and-wait-dataset {
"cluster" cluster
"centroid" (get rec "id")})
cluster-class (get-in rec ["center" category-id])
num-correct (class-count cluster-dataset category-id cluster-class))
(recur (tail clusters) (+ total-correct num-correct) (+ total (get rec "count")))
)
)
)
)
)
;
; Given a dataset, categorical id and a scale, creates
; a cluster with k = number of classes of the categorical
; field and with the appropriate scale. We then return the
; class purity of the cluster.
;
(define (purity-at-scale dataset category-id scale)
(let
(dataset-rec (if (map? dataset) dataset (fetch dataset))
num-classes (count (get-in dataset-rec [ "fields" category-id "summary" "categories" ]))
field-scales (assoc {} category-id scale)
cluster (create-and-wait-cluster {
"dataset" dataset
"k" num-classes
"field_scales" field-scales}))
(log-info "Cluster: " cluster)
(class-purity cluster category-id)
)
)
;
; Uses a binary search between max/min possible values of scale
; to find the minimum scale required to ensure class purity in the cluster
; for the specified categorical field.
;
(define (purity-min-scale dataset category-id min-scale max-scale min-accuracy max-loop)
(log-info "Analyzing category-id " category-id)
(loop
(lower min-scale upper max-scale loop-count 0)
(log-info "Loop " loop-count " upper-scale=" upper " lower-scale=" lower)
(let
( diff (- upper lower)
mid-point (+ lower (/ diff 2))
purity (purity-at-scale dataset category-id mid-point))
(log-info "Scale " mid-point " Purity " purity)
(if (or (<= (/ diff 2) min-accuracy) (> loop-count max-loop))
{"dataset" dataset
"category-id" category-id
"category-name" (get-in (fetch dataset) [ "fields" category-id "name" ])
"min-scale" (if (= (floor purity) 1) mid-point upper)
"loop-count" loop-count
"stop-criteria" (if (> loop-count max-loop) "max-loop" "min-accuracy")
"accuracy" (/ diff 2)}
(if (= (floor purity) 1)
(recur lower mid-point (+ loop-count 1))
(recur mid-point upper (+ loop-count 1))
)
)
)
)
)
;
; Loops over the specified categorical fields and finds the minimum
; scale for class purity for each.
;
(define (purity-min-scales dataset fields min-scale max-scale min-accuracy max-loop)
(let
(categoricals
(if (or (empty? fields) (= fields []))
(all-categoricals dataset)
(find-categoricals dataset fields)
)
)
(loop
(ids categoricals results [])
(if (= ids [])
results
(let
(id (head ids))
(recur (tail ids) (append results (purity-min-scale dataset id min-scale max-scale min-accuracy max-loop)))
)
)
)
)
)
(define min-scales (purity-min-scales dataset fields min-scale max-scale min-accuracy max-loop))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment