petersen-poul/min-scale-class-purity.json

## min-scale-class-purity.json
{
    "name": "Minimum Scale for Cluster Class Purity",
    "description": "Given a dataset and a categorical field, finds the minimum scale required to create class purity in the cluster with k = number of classes.",
    "inputs": [
        {
            "name": "dataset",
            "type": "dataset-id",
            "description": "Dataset to analyze."
        },
        {
            "name": "fields",
            "type": "list",
            "description": "A list of field names or ids to analyze for minimum scale class purity. If emtpy, all categorical values will be analyzed.",
            "default": []
        },
        {
            "name": "min-scale",
            "type": "number",
            "description": "Specifies the smallest scale value to check.",
            "default": 0
        },
        {
            "name": "max-scale",
            "type": "number",
            "description": "Specifies the largest scale value to check.",
            "default": 100
        },
        {
            "name": "min-accuracy",
            "type": "number",
            "description": "Each loop halves the range of possible scales. When the range is smaller than the accuracy, the search will stop.",
            "default": 0.01
        },
        {
            "name": "max-loop",
            "type": "number",
            "description": "Stops after a fixed number of loops of searching for each categorical value.",
            "default": 25
        }
    ],
    "outputs": [
        {
            "name": "min-scales",
            "type": "list",
            "description": "A list containing the minimum scales, along with other details, for each categorical value tested for class purity."
        }
    ]
}

## min-scale-class-purity.whizzml
;
; Given a dataset map or resource id, returns a list of
; field ids for all categorical fields in input_fields
;
(define (all-categoricals dataset)
  (let
    (dataset-rec (if (map? dataset) dataset (fetch dataset)))
    (loop
      (ids (get dataset-rec "input_fields") categoricals [])
      (if (= ids [])
        categoricals
        (let
          (id (head ids))
          (if (= (get-in dataset-rec [ "fields" id "optype" ]) "categorical")
            (recur (tail ids) (append categoricals id))
            (recur (tail ids) categoricals)
          )
        )
      )
    )
  )
)

;
; Given a dataset and a field identifier returns
; a list of categoricals that match either the field name
; or id. Normally this is should be one item, but if a field
; happens to have a name that looks like a field id (ex: "000001")
; then there could be multiple matches.
;
(define (find-categorical dataset identifier)
  (let
    (dataset-rec (if (map? dataset) dataset (fetch dataset))
     field_ids (get dataset-rec "input_fields"))
    (loop (ids field_ids result [])
      (if (= ids [])
        result
        (let
          (id (head ids)
           field-rec (get-in dataset-rec ["fields" id]))
          (if (and (or (= id identifier) (= (get field-rec "name") identifier)) (= (get field-rec "optype") "categorical"))
            (recur (tail ids) (append result id))
            (recur (tail ids) result)
          )
        )
      )
    )
  )
)

;
; Given a dataset and a list of identifiers, possibly
; empty, returns a list of all categoricals that match
; either by name or id.
;
(define (find-categoricals dataset identifiers)
  (let
    (dataset-rec (if (map? dataset) dataset (fetch dataset)))
    (if (or (= identifiers []) (empty? identifiers))
      (all-categoricals dataset)
      (loop (ids identifiers categoricals [])
        (if (= ids [])
          categoricals
          (recur (tail ids) (concat categoricals (find-categorical dataset (head ids))))
        )
      )
    )
  )
)

;
; Given a dataset and the id of a categorical value
; returns the number of instances in the dataset
; with the specified class.
;
(define (class-count dataset category-id class)
  (let
    (dataset-rec (if (map? dataset) dataset (fetch dataset))
     categories (get-in dataset-rec ["fields" category-id "summary" "categories"]))
    (loop
      (to-check categories)
      (if (= to-check [])
        0
        (let
          (category (head to-check))
          (if (= (nth category 0) class)
            (nth category 1)
            (recur (tail to-check))
          )
        )
      )
    )
  )
)

;
; Given a cluster, computes the class purity of the specified
; categorical field between cluster groups.
;
; For example, if the categorical value is TRUE/FALSE then
; we loop over the cluster groups and for each check the centroid
; say it is TRUE, then count the number of instance in that
; custer group that are TRUE.
;
(define (class-purity cluster category-id)
  (let
    (cluster-rec (if (map? cluster) cluster (fetch cluster)))
    (loop
      (clusters (get-in cluster-rec ["clusters" "clusters"]) total-correct 0 total 0)
      (if (= clusters [])
        (/ total-correct total)
        (let
          (rec (head clusters)
           cluster-dataset (create-and-wait-dataset {
             "cluster" cluster
             "centroid" (get rec "id")})
           cluster-class (get-in rec ["center" category-id])
           num-correct (class-count cluster-dataset category-id cluster-class))
          (recur (tail clusters) (+ total-correct num-correct) (+ total (get rec "count")))
        )
      )
    )
  )
)

;
; Given a dataset, categorical id and a scale, creates
; a cluster with k = number of classes of the categorical
; field and with the appropriate scale. We then return the
; class purity of the cluster.
;
(define (purity-at-scale dataset category-id scale)
  (let
    (dataset-rec (if (map? dataset) dataset (fetch dataset))
     num-classes (count (get-in dataset-rec [ "fields" category-id "summary" "categories" ]))
     field-scales (assoc {} category-id scale)
     cluster (create-and-wait-cluster {
       "dataset" dataset
       "k" num-classes
       "field_scales" field-scales}))
    (log-info "Cluster: " cluster)
    (class-purity cluster category-id)
  )
)

;
; Uses a binary search between max/min possible values of scale
; to find the minimum scale required to ensure class purity in the cluster
; for the specified categorical field.
;
(define (purity-min-scale dataset category-id min-scale max-scale min-accuracy max-loop)
  (log-info "Analyzing category-id " category-id)
  (loop
    (lower min-scale upper max-scale loop-count 0)
    (log-info "Loop " loop-count " upper-scale=" upper " lower-scale=" lower)
    (let
      ( diff (- upper lower)
        mid-point (+ lower (/ diff 2))
        purity (purity-at-scale dataset category-id mid-point))
      (log-info "Scale " mid-point " Purity " purity)
      (if (or (<= (/ diff 2) min-accuracy) (> loop-count max-loop))
        {"dataset" dataset
          "category-id" category-id
          "category-name" (get-in (fetch dataset) [ "fields" category-id "name" ])
          "min-scale" (if (= (floor purity) 1) mid-point upper)
          "loop-count" loop-count
          "stop-criteria" (if (> loop-count max-loop) "max-loop" "min-accuracy")
          "accuracy" (/ diff 2)}
        (if (= (floor purity) 1)
          (recur lower mid-point (+ loop-count 1))
          (recur mid-point upper (+ loop-count 1))
        )
      )
    )
  )
)

;
; Loops over the specified categorical fields and finds the minimum
; scale for class purity for each.
;

(define (purity-min-scales dataset fields min-scale max-scale min-accuracy max-loop)
  (let
    (categoricals
      (if (or (empty? fields) (= fields []))
        (all-categoricals dataset)
        (find-categoricals dataset fields)
      )
    )
    (loop
      (ids categoricals results [])
      (if (= ids [])
        results
        (let
          (id (head ids))
          (recur (tail ids) (append results (purity-min-scale dataset id min-scale max-scale min-accuracy max-loop)))
        )
      )
    )
  )
)

(define min-scales (purity-min-scales dataset fields min-scale max-scale min-accuracy max-loop))
	{
	"name": "Minimum Scale for Cluster Class Purity",
	"description": "Given a dataset and a categorical field, finds the minimum scale required to create class purity in the cluster with k = number of classes.",
	"inputs": [
	{
	"name": "dataset",
	"type": "dataset-id",
	"description": "Dataset to analyze."
	},
	{
	"name": "fields",
	"type": "list",
	"description": "A list of field names or ids to analyze for minimum scale class purity. If emtpy, all categorical values will be analyzed.",
	"default": []
	},
	{
	"name": "min-scale",
	"type": "number",
	"description": "Specifies the smallest scale value to check.",
	"default": 0
	},
	{
	"name": "max-scale",
	"type": "number",
	"description": "Specifies the largest scale value to check.",
	"default": 100
	},
	{
	"name": "min-accuracy",
	"type": "number",
	"description": "Each loop halves the range of possible scales. When the range is smaller than the accuracy, the search will stop.",
	"default": 0.01
	},
	{
	"name": "max-loop",
	"type": "number",
	"description": "Stops after a fixed number of loops of searching for each categorical value.",
	"default": 25
	}
	],
	"outputs": [
	{
	"name": "min-scales",
	"type": "list",
	"description": "A list containing the minimum scales, along with other details, for each categorical value tested for class purity."
	}
	]
	}
	;
	; Given a dataset map or resource id, returns a list of
	; field ids for all categorical fields in input_fields
	;
	(define (all-categoricals dataset)
	(let
	(dataset-rec (if (map? dataset) dataset (fetch dataset)))
	(loop
	(ids (get dataset-rec "input_fields") categoricals [])
	(if (= ids [])
	categoricals
	(let
	(id (head ids))
	(if (= (get-in dataset-rec [ "fields" id "optype" ]) "categorical")
	(recur (tail ids) (append categoricals id))
	(recur (tail ids) categoricals)
	)
	)
	)
	)
	)
	)

	;
	; Given a dataset and a field identifier returns
	; a list of categoricals that match either the field name
	; or id. Normally this is should be one item, but if a field
	; happens to have a name that looks like a field id (ex: "000001")
	; then there could be multiple matches.
	;
	(define (find-categorical dataset identifier)
	(let
	(dataset-rec (if (map? dataset) dataset (fetch dataset))
	field_ids (get dataset-rec "input_fields"))
	(loop (ids field_ids result [])
	(if (= ids [])
	result
	(let
	(id (head ids)
	field-rec (get-in dataset-rec ["fields" id]))
	(if (and (or (= id identifier) (= (get field-rec "name") identifier)) (= (get field-rec "optype") "categorical"))
	(recur (tail ids) (append result id))
	(recur (tail ids) result)
	)
	)
	)
	)
	)
	)

	;
	; Given a dataset and a list of identifiers, possibly
	; empty, returns a list of all categoricals that match
	; either by name or id.
	;
	(define (find-categoricals dataset identifiers)
	(let
	(dataset-rec (if (map? dataset) dataset (fetch dataset)))
	(if (or (= identifiers []) (empty? identifiers))
	(all-categoricals dataset)
	(loop (ids identifiers categoricals [])
	(if (= ids [])
	categoricals
	(recur (tail ids) (concat categoricals (find-categorical dataset (head ids))))
	)
	)
	)
	)
	)

	;
	; Given a dataset and the id of a categorical value
	; returns the number of instances in the dataset
	; with the specified class.
	;
	(define (class-count dataset category-id class)
	(let
	(dataset-rec (if (map? dataset) dataset (fetch dataset))
	categories (get-in dataset-rec ["fields" category-id "summary" "categories"]))
	(loop
	(to-check categories)
	(if (= to-check [])
	0
	(let
	(category (head to-check))
	(if (= (nth category 0) class)
	(nth category 1)
	(recur (tail to-check))
	)
	)
	)
	)
	)
	)

	;
	; Given a cluster, computes the class purity of the specified
	; categorical field between cluster groups.
	;
	; For example, if the categorical value is TRUE/FALSE then
	; we loop over the cluster groups and for each check the centroid
	; say it is TRUE, then count the number of instance in that
	; custer group that are TRUE.
	;
	(define (class-purity cluster category-id)
	(let
	(cluster-rec (if (map? cluster) cluster (fetch cluster)))
	(loop
	(clusters (get-in cluster-rec ["clusters" "clusters"]) total-correct 0 total 0)
	(if (= clusters [])
	(/ total-correct total)
	(let
	(rec (head clusters)
	cluster-dataset (create-and-wait-dataset {
	"cluster" cluster
	"centroid" (get rec "id")})
	cluster-class (get-in rec ["center" category-id])
	num-correct (class-count cluster-dataset category-id cluster-class))
	(recur (tail clusters) (+ total-correct num-correct) (+ total (get rec "count")))
	)
	)
	)
	)
	)

	;
	; Given a dataset, categorical id and a scale, creates
	; a cluster with k = number of classes of the categorical
	; field and with the appropriate scale. We then return the
	; class purity of the cluster.
	;
	(define (purity-at-scale dataset category-id scale)
	(let
	(dataset-rec (if (map? dataset) dataset (fetch dataset))
	num-classes (count (get-in dataset-rec [ "fields" category-id "summary" "categories" ]))
	field-scales (assoc {} category-id scale)
	cluster (create-and-wait-cluster {
	"dataset" dataset
	"k" num-classes
	"field_scales" field-scales}))
	(log-info "Cluster: " cluster)
	(class-purity cluster category-id)
	)
	)

	;
	; Uses a binary search between max/min possible values of scale
	; to find the minimum scale required to ensure class purity in the cluster
	; for the specified categorical field.
	;
	(define (purity-min-scale dataset category-id min-scale max-scale min-accuracy max-loop)
	(log-info "Analyzing category-id " category-id)
	(loop
	(lower min-scale upper max-scale loop-count 0)
	(log-info "Loop " loop-count " upper-scale=" upper " lower-scale=" lower)
	(let
	( diff (- upper lower)
	mid-point (+ lower (/ diff 2))
	purity (purity-at-scale dataset category-id mid-point))
	(log-info "Scale " mid-point " Purity " purity)
	(if (or (<= (/ diff 2) min-accuracy) (> loop-count max-loop))
	{"dataset" dataset
	"category-id" category-id
	"category-name" (get-in (fetch dataset) [ "fields" category-id "name" ])
	"min-scale" (if (= (floor purity) 1) mid-point upper)
	"loop-count" loop-count
	"stop-criteria" (if (> loop-count max-loop) "max-loop" "min-accuracy")
	"accuracy" (/ diff 2)}
	(if (= (floor purity) 1)
	(recur lower mid-point (+ loop-count 1))
	(recur mid-point upper (+ loop-count 1))
	)
	)
	)
	)
	)

	;
	; Loops over the specified categorical fields and finds the minimum
	; scale for class purity for each.
	;

	(define (purity-min-scales dataset fields min-scale max-scale min-accuracy max-loop)
	(let
	(categoricals
	(if (or (empty? fields) (= fields []))
	(all-categoricals dataset)
	(find-categoricals dataset fields)
	)
	)
	(loop
	(ids categoricals results [])
	(if (= ids [])
	results
	(let
	(id (head ids))
	(recur (tail ids) (append results (purity-min-scale dataset id min-scale max-scale min-accuracy max-loop)))
	)
	)
	)
	)
	)

	(define min-scales (purity-min-scales dataset fields min-scale max-scale min-accuracy max-loop))