whizzmler/find-neighbors.json

## readme.md

      
    Raw
  

              readme.md
            
          
    Finding neighbors to a given instance within a cluster

This script takes as inputs a cluster identifier, an instance, i.e.,
a map with values for all fields used by the cluster, and a positive
count n.  It then:

Finds the centroid in the cluster closer to the given instance p
Selects within that centroid's dataset the n instances that are closest to p
If there are less than n rows in the centroid's dataset, missing instances
are read from the next closest centroid.

This workflow uses flatline to compute the distance between p and
the centroid datasets (via the row-distance-squared flatline function)
and add an extra column to the dataset, and then creates a sample of
the result, ordered by the computed distance.

  
## find-neighbors.json
{
  "name": "Find neighbors",
  "description": "Find the closest cluster rows to a given one",
  "inputs": [
    {"name": "cluster-id", "type": "cluster-id", "description": "The cluster to select rows from"},
    {"name": "n", "type": "number", "description": "The number of points to return"},
    {"name": "instance", "type": "map", "description": "Base row to compute distances, as a map from field identifiers to values"}
  ],
  "outputs": [
    {"name": "rows", "type": "list", "description": "The list of the n closest rows to `base`"}
  ]
}

## find-neighbors.whizzml
;; Given a cluster and a point, find the closet n
;; rows in the cluster's dataset(s).

;; Given a cluster and an instance with the same fields as its dataset,
;; order the cluster's centroids by increasing distance to the instance.
(define (ordered-centroids cluster instance)
  (let (descs (get-in cluster ["clusters" "fields"])
        weights (get cluster "scales")
        cs (map (lambda (cn)
                  (let (c (get cn "center")
                        d (row-distance-squared descs weights c instance))
                  (assoc c "distance" (or d -1) "id" (get cn "id"))))
                (get-in cluster ["clusters" "clusters"]))
        cs (filter (lambda (c) (positive? (get c "distance"))) cs))
    (sort-by-key "distance" cs)))

;; Auxiliary function for error signaling.
(define (raise-missing id)
  (raise {"message" (str "Missing input field: " id) "code" -1}))

;; Auxiliary function: constructs the flatline string that generates a
;; new field with the distance of each row to the given one.
(define (distance-flatline cluster instance)
  (let (ids (get cluster "input_fields")
        ps (map (lambda (id) (or (get instance id) (raise-missing id))) ids)
        scales (get cluster "scales")
        ws (map (lambda (id) (get scales id 1)) ids))
    (flatline "(row-distance-squared (list @{{ps}}) (all) (list @{{ws}}))")))

;; Given a cluster and one of its centroids, uses the flatline
;; string generated by `distance-flatline` to create a new
;; dataset that extend's the centroid dataset with a distance
;; column.
(define (generate-distance-dataset cluster cent fl)
  (let (cluster-id (get cluster "resource")
        id (or (get cent "id") (raise (str "No id in " cent)))
        ds-id (get-in cluster ["cluster_datasets" id])
        ds-id (if (or (not ds-id) (empty? ds-id))
                  (create-and-wait-dataset {"cluster" cluster-id "centroid" id})
                  (str "dataset/" ds-id)))
    (create-and-wait-dataset {"origin_dataset" ds-id
                              "new_fields" [{"name" "distance" "field" fl}]})))

;; Given an extended centroid dataset (created by
;; `generate-distance-dataset`), returns the list of `n` rows with the
;; smallest values in the distance column.
(define (fetch-dataset-instances ds-id n)
  (let (sample-id (create-and-wait-sample {"dataset" ds-id})
        obj-id (dataset-get-objective-id ds-id)
        sample (fetch sample-id {"row_order_by" obj-id
                                 "rows" n
                                 "mode" "linear"})
        rows (get-in sample ["sample" "rows"] []))
    (delete sample-id)
    rows))

;; Final workflow.
(define (find-neighbors cluster-id instance n)
  (let (cluster (fetch cluster-id {"limit" -1})
        fl (distance-flatline cluster instance))
    (loop (cps (ordered-centroids cluster instance) instances [] m n)
      (cond (empty? cps) instances
            (< m 1) instances
            (let (ds-id (generate-distance-dataset cluster (head cps) fl)
                  new-instances (fetch-dataset-instances ds-id m))
              (recur (tail cps)
                     (concat instances new-instances)
                     (- m (count new-instances))))))))

;; Inputs and outputs
(define rows (find-neighbors cluster-id instance n))
	{
	"name": "Find neighbors",
	"description": "Find the closest cluster rows to a given one",
	"inputs": [
	{"name": "cluster-id", "type": "cluster-id", "description": "The cluster to select rows from"},
	{"name": "n", "type": "number", "description": "The number of points to return"},
	{"name": "instance", "type": "map", "description": "Base row to compute distances, as a map from field identifiers to values"}
	],
	"outputs": [
	{"name": "rows", "type": "list", "description": "The list of the n closest rows to `base`"}
	]
	}
	;; Given a cluster and a point, find the closet n
	;; rows in the cluster's dataset(s).

	;; Given a cluster and an instance with the same fields as its dataset,
	;; order the cluster's centroids by increasing distance to the instance.
	(define (ordered-centroids cluster instance)
	(let (descs (get-in cluster ["clusters" "fields"])
	weights (get cluster "scales")
	cs (map (lambda (cn)
	(let (c (get cn "center")
	d (row-distance-squared descs weights c instance))
	(assoc c "distance" (or d -1) "id" (get cn "id"))))
	(get-in cluster ["clusters" "clusters"]))
	cs (filter (lambda (c) (positive? (get c "distance"))) cs))
	(sort-by-key "distance" cs)))

	;; Auxiliary function for error signaling.
	(define (raise-missing id)
	(raise {"message" (str "Missing input field: " id) "code" -1}))

	;; Auxiliary function: constructs the flatline string that generates a
	;; new field with the distance of each row to the given one.
	(define (distance-flatline cluster instance)
	(let (ids (get cluster "input_fields")
	ps (map (lambda (id) (or (get instance id) (raise-missing id))) ids)
	scales (get cluster "scales")
	ws (map (lambda (id) (get scales id 1)) ids))
	(flatline "(row-distance-squared (list @{{ps}}) (all) (list @{{ws}}))")))

	;; Given a cluster and one of its centroids, uses the flatline
	;; string generated by `distance-flatline` to create a new
	;; dataset that extend's the centroid dataset with a distance
	;; column.
	(define (generate-distance-dataset cluster cent fl)
	(let (cluster-id (get cluster "resource")
	id (or (get cent "id") (raise (str "No id in " cent)))
	ds-id (get-in cluster ["cluster_datasets" id])
	ds-id (if (or (not ds-id) (empty? ds-id))
	(create-and-wait-dataset {"cluster" cluster-id "centroid" id})
	(str "dataset/" ds-id)))
	(create-and-wait-dataset {"origin_dataset" ds-id
	"new_fields" [{"name" "distance" "field" fl}]})))

	;; Given an extended centroid dataset (created by
	;; `generate-distance-dataset`), returns the list of `n` rows with the
	;; smallest values in the distance column.
	(define (fetch-dataset-instances ds-id n)
	(let (sample-id (create-and-wait-sample {"dataset" ds-id})
	obj-id (dataset-get-objective-id ds-id)
	sample (fetch sample-id {"row_order_by" obj-id
	"rows" n
	"mode" "linear"})
	rows (get-in sample ["sample" "rows"] []))
	(delete sample-id)
	rows))

	;; Final workflow.
	(define (find-neighbors cluster-id instance n)
	(let (cluster (fetch cluster-id {"limit" -1})
	fl (distance-flatline cluster instance))
	(loop (cps (ordered-centroids cluster instance) instances [] m n)
	(cond (empty? cps) instances
	(< m 1) instances
	(let (ds-id (generate-distance-dataset cluster (head cps) fl)
	new-instances (fetch-dataset-instances ds-id m))
	(recur (tail cps)
	(concat instances new-instances)
	(- m (count new-instances))))))))

	;; Inputs and outputs
	(define rows (find-neighbors cluster-id instance n))