Skip to content

Instantly share code, notes, and snippets.

@whizzmler
Last active May 14, 2016 10:37
Show Gist options
  • Save whizzmler/e1e80a9ccee28565be6dc3fe1cf4a73e to your computer and use it in GitHub Desktop.
Save whizzmler/e1e80a9ccee28565be6dc3fe1cf4a73e to your computer and use it in GitHub Desktop.
Cluster neighbors

Finding neighbors to a given instance within a cluster

This script takes as inputs a cluster identifier, an instance, i.e., a map with values for all fields used by the cluster, and a positive count n. It then:

  • Finds the centroid in the cluster closer to the given instance p
  • Selects within that centroid's dataset the n instances that are closest to p
  • If there are less than n rows in the centroid's dataset, missing instances are read from the next closest centroid.

This workflow uses flatline to compute the distance between p and the centroid datasets (via the row-distance-squared flatline function) and add an extra column to the dataset, and then creates a sample of the result, ordered by the computed distance.

{
"name": "Find neighbors",
"description": "Find the closest cluster rows to a given one",
"inputs": [
{"name": "cluster-id", "type": "cluster-id", "description": "The cluster to select rows from"},
{"name": "n", "type": "number", "description": "The number of points to return"},
{"name": "instance", "type": "map", "description": "Base row to compute distances, as a map from field identifiers to values"}
],
"outputs": [
{"name": "rows", "type": "list", "description": "The list of the n closest rows to `base`"}
]
}
;; Given a cluster and a point, find the closet n
;; rows in the cluster's dataset(s).
;; Given a cluster and an instance with the same fields as its dataset,
;; order the cluster's centroids by increasing distance to the instance.
(define (ordered-centroids cluster instance)
(let (descs (get-in cluster ["clusters" "fields"])
weights (get cluster "scales")
cs (map (lambda (cn)
(let (c (get cn "center")
d (row-distance-squared descs weights c instance))
(assoc c "distance" (or d -1) "id" (get cn "id"))))
(get-in cluster ["clusters" "clusters"]))
cs (filter (lambda (c) (positive? (get c "distance"))) cs))
(sort-by-key "distance" cs)))
;; Auxiliary function for error signaling.
(define (raise-missing id)
(raise {"message" (str "Missing input field: " id) "code" -1}))
;; Auxiliary function: constructs the flatline string that generates a
;; new field with the distance of each row to the given one.
(define (distance-flatline cluster instance)
(let (ids (get cluster "input_fields")
ps (map (lambda (id) (or (get instance id) (raise-missing id))) ids)
scales (get cluster "scales")
ws (map (lambda (id) (get scales id 1)) ids))
(flatline "(row-distance-squared (list @{{ps}}) (all) (list @{{ws}}))")))
;; Given a cluster and one of its centroids, uses the flatline
;; string generated by `distance-flatline` to create a new
;; dataset that extend's the centroid dataset with a distance
;; column.
(define (generate-distance-dataset cluster cent fl)
(let (cluster-id (get cluster "resource")
id (or (get cent "id") (raise (str "No id in " cent)))
ds-id (get-in cluster ["cluster_datasets" id])
ds-id (if (or (not ds-id) (empty? ds-id))
(create-and-wait-dataset {"cluster" cluster-id "centroid" id})
(str "dataset/" ds-id)))
(create-and-wait-dataset {"origin_dataset" ds-id
"new_fields" [{"name" "distance" "field" fl}]})))
;; Given an extended centroid dataset (created by
;; `generate-distance-dataset`), returns the list of `n` rows with the
;; smallest values in the distance column.
(define (fetch-dataset-instances ds-id n)
(let (sample-id (create-and-wait-sample {"dataset" ds-id})
obj-id (dataset-get-objective-id ds-id)
sample (fetch sample-id {"row_order_by" obj-id
"rows" n
"mode" "linear"})
rows (get-in sample ["sample" "rows"] []))
(delete sample-id)
rows))
;; Final workflow.
(define (find-neighbors cluster-id instance n)
(let (cluster (fetch cluster-id {"limit" -1})
fl (distance-flatline cluster instance))
(loop (cps (ordered-centroids cluster instance) instances [] m n)
(cond (empty? cps) instances
(< m 1) instances
(let (ds-id (generate-distance-dataset cluster (head cps) fl)
new-instances (fetch-dataset-instances ds-id m))
(recur (tail cps)
(concat instances new-instances)
(- m (count new-instances))))))))
;; Inputs and outputs
(define rows (find-neighbors cluster-id instance n))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment