Skip to content

Instantly share code, notes, and snippets.

@ashenfad
Last active September 25, 2015 20:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ashenfad/979783 to your computer and use it in GitHub Desktop.
Save ashenfad/979783 to your computer and use it in GitHub Desktop.
Streaming (one-pass) technique for sampling with replacement
;; For a fully fleshed out library, see:
;; https://github.com/bigmlcom/sampling
;; -------------------------------------
;; see http://ashenfad.blogspot.com/2011/06/single-pass-sampling-with-replacement.html
(ns sample.replacement
(:use [clojure.contrib.math :only [expt]]))
(defn- choose [a b]
(let [numerator (apply * (range (- (inc a) b) (inc a)))
denominator (apply * (range 1 (inc b)))]
(/ numerator denominator)))
(defn- find-occurance-prob [population-size sample-size occurances]
(let [single-select-prob (double (/ 1 population-size))
single-unselect-prob (- 1 single-select-prob)
occurance-prob (expt single-select-prob occurances)
non-occurances (- sample-size occurances)
non-occurance-prob (expt single-unselect-prob non-occurances)
combinations (choose sample-size occurances)
probability (* occurance-prob non-occurance-prob combinations)]
probability))
(defn- normalize-dist [dist-seq]
(let [sum (apply + (map second dist-seq))]
(map (fn [x] [(first x) (/ (second x) sum)]) dist-seq)))
(defn- occurance-dist [population-size sample-size]
(loop [cd-value 0
distribution []]
(if (< cd-value 0.999999)
(let [occurances (count distribution)
occurance-prob (find-occurance-prob
population-size
sample-size
occurances)]
(recur
(+ cd-value occurance-prob)
(conj distribution [occurances occurance-prob])))
(normalize-dist distribution))))
(defn occurance-map [population-size sample-size]
(let [init-dist-seq (occurance-dist population-size sample-size)]
(loop [sum 0
dist-seq init-dist-seq
dist-map (sorted-map)]
(let [current-occurance (first dist-seq)
rest-dist-seq (rest dist-seq)
occurance-count (first current-occurance)
occurance-sum (+ sum (second current-occurance))
new-dist-map (assoc dist-map occurance-sum occurance-count)]
(if (seq rest-dist-seq)
(recur occurance-sum rest-dist-seq new-dist-map)
new-dist-map)))))
(defn- roll-occurances [dist-map]
(second (first (subseq dist-map >= (rand)))))
(defn- sample-occurances [dist-map datum]
(take (roll-occurances dist-map) (repeat datum)))
(defn sample-with-replacement-dist [dist-map data]
(mapcat
(fn [datum] (sample-occurances dist-map datum))
data))
(defn sample-with-replacement [population-size sample-size data]
(sample-with-replacement-dist
(occurance-map population-size sample-size)
(take population-size data)))
; example:
;(def population-size 100000)
;(def sample-size 10000)
;(def data (take population-size (repeatedly (fn [] (rand-int 10000)))))
;(def sample (sample-with-replacement population-size sample-size data))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment