Skip to content

Instantly share code, notes, and snippets.

@mthomure
Last active January 18, 2017 01:10
Show Gist options
  • Save mthomure/1d6399376d4ff0c90de3fe253fc88174 to your computer and use it in GitHub Desktop.
Save mthomure/1d6399376d4ff0c90de3fe253fc88174 to your computer and use it in GitHub Desktop.
simple k-means in clojure
(ns k-means
(:import [org.apache.commons.math3.ml.clustering
Clusterable KMeansPlusPlusClusterer]
[org.apache.commons.math3.ml.distance EuclideanDistance]))
;; dependency: [org.apache.commons/commons-math3 "3.6.1"]
;; Adapter that lets a value be handed to the commons-math3 clusterers:
;; it implements Clusterable by returning the wrapped field `x` unchanged.
;; `k-means` below constructs it around a double array.
(defrecord ClusterableWrapper [x]
  Clusterable
  ;; The receiver itself is not needed; only the wrapped field is exposed.
  (getPoint [_] x))
(defn k-means
  "Clusters `features` (a collection of numeric rows) with
  KMeansPlusPlusClusterer.

  Keyword options:
    :k              number of clusters passed to the clusterer (default 10)
    :max-iterations iteration limit passed to the clusterer (default 10000)

  Returns a map with
    :centroids   seq of vectors, one per cluster, holding the centre coordinates
    :assignments seq with one entry per input row (in input order) giving the
                 index into :centroids of the cluster that contains the row"
  [features & {:keys [k max-iterations]
               :or {k 10
                    max-iterations 10000}}]
  (let [wrapped (mapv (fn [row] (ClusterableWrapper. (double-array row))) features)
        clusterer (KMeansPlusPlusClusterer. k max-iterations)
        clusters (.cluster clusterer wrapped)
        ;; Reverse lookup: wrapped point -> index of the cluster holding it.
        point->index (->> clusters
                          (map-indexed (fn [idx cluster]
                                         (map (fn [pt] [pt idx])
                                              (.getPoints cluster))))
                          (mapcat identity)
                          (into {}))]
    {:centroids (map (fn [cluster]
                       (into [] (.getPoint (.getCenter cluster))))
                     clusters)
     :assignments (map point->index wrapped)}))
(defn- k-means-assign-1
  "Returns the index (position in `centroids`) of the centroid closest to
  `point` according to `measure`.

  measure   - a commons-math3 DistanceMeasure
  centroids - seq of double arrays
  point     - double array

  The parameters and the centroid are type-hinted so that `.compute` is
  compiled as a direct interface call; without the hints every distance
  evaluation is resolved through reflection."
  [^org.apache.commons.math3.ml.distance.DistanceMeasure measure
   centroids
   ^doubles point]
  (->> centroids
       ;; Pair each centroid with its position: [index centroid].
       (map-indexed vector)
       (apply min-key (fn [pair]
                        (let [^doubles centroid (second pair)]
                          (.compute measure point centroid))))
       first))
(defn k-means-assign
  "For every element of `points`, yields the index of the closest entry in
  `centroids`, measured with EuclideanDistance. Both arguments are
  collections of numeric rows; each row is converted to a double array
  before distances are computed. The result is a lazy seq in the order of
  `points`."
  [centroids points]
  (let [centers (map double-array centroids)
        metric (EuclideanDistance.)]
    (for [pt points]
      (k-means-assign-1 metric centers (double-array pt)))))
(require '[k-means :refer :all])
(defn blob
  "Isotropic Gaussian blobs for clustering.

  Keyword options:
    :num-samples  number of points to generate (default 12)
    :num-features number of coordinates per point (default 2)
    :cluster-std  standard deviation of the per-coordinate noise (default 1.0)
    :center-box   [min max] interval from which each centre coordinate is
                  drawn uniformly (default [-10.0 10.0])

  Returns a map with
    :centroid seq of `num-features` centre coordinates
    :points   seq of `num-samples` vectors, each the centre plus zero-mean
              Gaussian noise scaled by `cluster-std`"
  [& {:keys [num-samples num-features cluster-std center-box]
      :or {num-samples 12
           num-features 2
           cluster-std 1.0
           center-box [-10.0 10.0]}}]
  (let [[cmin cmax] center-box
        crange (- cmax cmin)
        rng (java.util.Random.)
        center (repeatedly num-features #(-> (rand) (* crange) (+ cmin)))
        ;; Noise must be zero-mean Gaussian. A uniform (rand) draw is confined
        ;; to [0, 1), which pushes every point to one side of the centre so
        ;; that :centroid no longer describes the middle of the blob.
        points (->> (repeatedly #(* cluster-std (.nextGaussian rng)))
                    (partition num-features)
                    (take num-samples)
                    (map #(mapv + center %)))]
    {:centroid center
     :points points}))
;; Example session. The commented values below are the output of one
;; particular run; `blob` draws random numbers, so they differ on every
;; evaluation.

;; Three blobs of two points each.
(def blobs (repeatedly 3 #(blob :num-samples 2)))
;; ({:centroid (3.6065806787961474 -3.4831673166556465),
;; :points
;; ([3.938402488879002 -2.574654565134169]
;; [4.41362664557822 -3.4196443259839007])}
;; {:centroid (-9.33722290094038 -7.7122443338667885),
;; :points
;; ([-8.60315679311847 -7.352826777403295]
;; [-8.58933442539004 -7.634666257054288])}
;; {:centroid (-7.611451089070584 -3.6657499059097542),
;; :points
;; ([-7.331312967407008 -2.7336492340643788]
;; [-6.640609061939679 -2.974514030116507])})

;; Cluster all six points into three groups. :assignments is in the same
;; order as (mapcat :points blobs) and indexes into :centroids.
(def model (k-means (mapcat :points blobs) :k 3))
;; {:centroids
;; ([-8.596245609254256 -7.493746517228791]
;; [4.176014567228611 -2.997149445559035]
;; [-6.985961014673343 -2.854081632090443]),
;; :assignments (1 1 0 0 2 2)}

;; Re-assigning the same points to the fitted centroids gives the same
;; indices as :assignments in the run shown above.
(k-means-assign (:centroids model) (mapcat :points blobs))
;; (1 1 0 0 2 2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment