Skip to content

Instantly share code, notes, and snippets.

@ericnormand
Last active July 21, 2019 20:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ericnormand/8374135db38d435bfb97a369f49ed8e1 to your computer and use it in GitHub Desktop.
Save ericnormand/8374135db38d435bfb97a369f49ed8e1 to your computer and use it in GitHub Desktop.

calculate quartiles

It's often useful to know the quartiles of a dataset. The quartiles define the lowest quarter, lowest half, and lowest three quarters data points. Those are typically called q1, q2, and q3. I also like to include q0 and q4, which are the min and max.

q2 is just the median of the dataset. q1 is the median of the lower half of the dataset. And q3 is the median of the upper half of the dataset.

Write a function which calculates the quartiles of a sequence of numbers. You will need to be able to calculate the median (which you can look up easily).

(defn average [a b]
(/ (+ a b) 2))
(defn median [sorted-vec]
(let [h (quot (count sorted-vec) 2)]
(if (even? (count sorted-vec))
(average (get sorted-vec (dec h)) (get sorted-vec h))
(get sorted-vec h))))
(defn halves [sorted-vec]
(let [h (quot (count sorted-vec) 2)]
(if (even? (count sorted-vec))
[(subvec sorted-vec 0 h)
(subvec sorted-vec h)]
[(subvec sorted-vec 0 (inc h))
(subvec sorted-vec h)])))
(defn quartiles
"Calculate the quartiles with Method 2 from https://en.wikipedia.org/wiki/Quartile"
[numbers]
(let [sorted-vec (vec (sort numbers))
[h1 h2] (halves sorted-vec)]
{:q0 (first sorted-vec)
:q1 (median h1)
:q2 (median sorted-vec)
:q3 (median h2)
:q4 (last sorted-vec)}))
(ns clj-challenge.quartiles)
(defn median
"Returns the median value of a collection. The count `cnt` can be passed in to
save time."
([coll]
(median coll (count coll)))
([coll cnt]
(cond
(odd? cnt) (nth coll (/ cnt 2))
(even? cnt) (/ (+ (nth coll (/ cnt 2))
(nth coll (/ (- cnt 1) 2)))
2))))
(defn halves
"Returns the ((left) (right)) halves of a collection. The count `cnt` can be passed
in to save time."
([coll]
(halves coll (count coll)))
([coll cnt]
(cond
(odd? cnt) (let [[left right] (split-at (/ cnt 2) coll)]
(list (butlast left) right))
(even? cnt) (split-at (/ cnt 2) coll))))
(defn quartiles
"Returns the quartiles of a collection, like this:
{:q0 1 ;; the min
:q1 2 ;; 25th percentile
:q2 3 ;; the median
:q3 4 ;; 75th percentile
:q4 5} ;; the max"
[coll]
(let [sorted (sort coll)
cnt (count sorted)
cnt-half (quot cnt 2)
[left right] (halves sorted cnt)]
{:q0 (first sorted)
:q1 (median left cnt-half)
:q2 (median sorted cnt)
:q3 (median right cnt-half)
:q4 (last sorted)}))
;; REPL expressions to try things out
(comment
;; Sample data
(def even [1 10 2 9 3 8 4 7 5 6])
(def odd [1 10 2 9 3 8 4 7 5 6 11])
(def big (take 1000000 (map rand-int (repeat 1000000))))
;; Get median, halves
(median even)
(median odd)
(halves even)
(halves odd)
;; Even number of elements
(quartiles even)
;; Odd number of elements
(quartiles odd)
;; Big random numbers
(time (quartiles big)))
(ns scratchpad.pftv.335
(:require [clojure.string :as str]
[clojure.test :refer [deftest is testing run-tests]]))
;; uses "Method 1" from https://en.wikipedia.org/wiki/Quartile#Computing_methods
;; assumes v is a sorted vec
(defn median* [v]
(if (not (empty? v))
(let [len (count v)
mid (quot len 2)]
(if (even? len)
[(/ (+ (nth v (dec mid)) (nth v mid)) 2)
(subvec v 0 mid)
(subvec v mid)]
[(nth v mid)
(subvec v 0 mid)
(subvec v (inc mid))]))))
(defn median [lst]
(first (median* (vec (sort lst)))))
(defn quartiles [lst]
(let [v (vec (sort lst))
q0 (first v)
[q2 left right] (median* v)
[q1 & _] (median* left)
[q3 & _] (median* right)
q4 (last v)]
(if q2
[q0 q1 q2 q3 q4])))
(deftest test-median
(testing "All cases of median"
(is (nil? (median [])))
(is (= (median [1]) 1))
(is (= (median [1 2]) 3/2))
(is (= (median [1 2 3]) 2)))
(testing "Median of Wikipedia example"
(is (= (median [6 7 15 36 39 40 41 42 43 47 49]) 40))))
(deftest test-quartiles
(testing "All cases of quartiles"
(is (nil? (quartiles [])))
(is (= (quartiles [1]) [1 nil 1 nil 1]))
(is (= (quartiles [1 2]) [1 1 3/2 2 2]))
(is (= (quartiles [1 2 3 4 5 6 7]) [1 2 4 6 7])))
(testing "Quartiles of Wikipedia example"
(is (= (quartiles [6 7 15 36 39 40 41 42 43 47 49]) [6 15 40 43 49]))))
(run-tests)
(defn halve [s]
(split-at (int (/ (count s) 2)) (sort s)))
(defn median [s]
(let [[lh [fl & _]] (halve s)]
(if (even? (count s)) (/ (+ (last lh) fl) 2.0) fl)))
; uses quartile method 1 from https://en.wikipedia.org/wiki/Quartile
(defn quartiles [s]
(let [ss (sort s)
[lh uh] (halve s)
uhm (if (even? (count uh)) uh (rest uh))]
{:q0 (first ss)
:q1 (median lh)
:q2 (median ss)
:q3 (median uhm)
:q4 (last ss)}))
;; Tests
;; check against sample data and answer from
;; https://www.statisticshowto.datasciencecentral.com/calculators/interquartile-range-calculator/
(quartiles '(12, 13, 15, 18, 19, 22, 88, 89, 90, 91, 92, 93, 95, 98, 99, 101, 101, 103, 105, 106, 107, 108, 109, 200, 201, 201, 203, 204, 215, 216, 217, 222, 223, 224, 225, 227, 229, 230, 232, 245, 246, 250, 258, 270, 271, 271, 272, 273))
;; {:q0 12, :q1 94.0, :q2 200.5, :q3 228.0, :q4 273}
(quartiles '(1 2 3 4 5))
;; {:q0 1, :q1 1.5, :q2 3, :q3 4.5, :q4 5}
(quartiles '(1 2 3 4 5 6))
;; {:q0 1, :q1 2, :q2 3.5, :q3 5.5, :q4 6}
(quartiles (range 1 100))
;; {:q0 1, :q1 25, :q2 50, :q3 74.5, :q4 99}
(defn genrndnseq [l n]
(repeatedly l #(rand-int n)))
;; runs with increasing numbers of randoms 0-100
(quartiles (genrndnseq 10 100)). ;; {:q0 8, :q1 40, :q2 56.0, :q3 78.0, :q4 89}
(quartiles (genrndnseq 100 100)) ;; {:q0 0, :q1 33.5, :q2 56.5, :q3 82.5, :q4 98}
(quartiles (genrndnseq 1000 100)) ;; {:q0 0, :q1 25.0, :q2 49.0, :q3 74.0, :q4 100}
(quartiles (genrndnseq 10000 100)) ;; {:q0 0, :q1 25.0, :q2 51.0, :q3 75.0, :q4 100}
(quartiles (genrndnseq 100000 100)) ;; {:q0 0, :q1 25.0, :q2 50.0, :q3 75.0, :q4 100}
(ns miner.quartiles)
;; https://purelyfunctional.tv/issues/purelyfunctional-tv-newsletter-335-idea-when-can-you-use-property-based-testing/
;; Quartiles
;;
;; q2 is just the median of the dataset. q1 is the median of the lower half of the
;; dataset. And q3 is the median of the upper half of the dataset. I also like to include
;; q0 and q4, which are the min and max.
;; We have to decide if the median belongs to the lower or upper halves (or neither).
;; I'm using "Tukey's hinges" or what Wikipedia calls "Method 2". If the original data has an even
;; count, split the data exactly in half. If it has an odd count, include the median in
;; both halves. This convention eliminates an awkward edge case for single element data sets.
;;
;; https://en.wikipedia.org/wiki/Quartile
(defn- median-sorted-vec [v]
(let [cnt (count v)
mid (quot cnt 2)]
(if (odd? cnt)
(v mid)
(/ (+ (v (dec mid)) (v mid)) 2.0))))
(defn quartiles [coll]
"Returns a vector of 5 elements based on the collection of numbers `coll`. Q0 is the
minimum, Q1 is the median of the lower half, Q2 is the median, Q3 is the median of the
upper half, and Q4 is the maximum. Returns nil for the empty collection. The halves are
split according to the *Tukey's hinges* convention in which the median belongs to both
halves if the dataset has an odd number of elements."
(when (seq coll)
(let [v (vec (sort coll))
cnt (count v)
lower (subvec v 0 (quot (inc cnt) 2))
upper (subvec v (if (odd? cnt) (dec (count lower)) (count lower)))]
[(v 0)
(median-sorted-vec lower)
(median-sorted-vec v)
(median-sorted-vec upper)
(v (dec cnt))])))
(defn smoke-test-quartiles []
(assert (= (quartiles [6, 7, 15, 36, 39, 40, 41, 42, 43, 47, 49])
[6 25.5 40 42.5 49]))
(assert (= (quartiles [7, 15, 36, 39, 40, 41])
[7 15 37.5 40 41]))
(assert (= (quartiles (range 0.0 10.1 0.5))
[0.0 2.5 5.0 7.5 10.0]))
(assert (= (quartiles [13])
[13 13 13 13 13]))
(assert (= (quartiles (range 100001))
[0 25000 50000 75000 100000]))
(assert (nil? (quartiles [])))
true)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment