Skip to content

Instantly share code, notes, and snippets.

@rgrig
Last active August 29, 2015 14:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rgrig/d4dd95d75dba40e41694 to your computer and use it in GitHub Desktop.
Save rgrig/d4dd95d75dba40e41694 to your computer and use it in GitHub Desktop.
computing the median for a list of integers
N=10000001 M=1000
10.169694 10.154661 setup
2.906270 2.901401 statistics.median
result 499
15.317439 15.293849 quickselect
result 499
22.627862 22.582209 jukka_a sqrt
result 499
9.278614 9.260504 jukka_a 10
result 499
0.003925 0.003904 approx
result 500
0.856507 0.854239 smallints
result 499
N=10000001 M=100000000
10.735744 10.718981 setup
8.204952 8.190202 statistics.median
result 49990589
9.003044 8.981585 quickselect
result 49990589
26.867112 26.827628 jukka_a sqrt
result 49990589
10.137187 10.121637 jukka_a 10
result 49990589
0.004148 0.004141 approx
result 49197494
0.168329 0.168003 smallints
result None
N=100000001 M=10000
109.183333 108.971217 setup
47.319695 47.233222 statistics.median
result 5000
59.217462 59.105478 quickselect
result 5000
299.210840 298.685356 jukka_a sqrt
result 5000
103.222082 103.026732 jukka_a 10
result 5000
0.016545 0.016499 approx
result 4971
10.644286 10.624396 smallints
result 5000
N=100000001 M=1000000000
105.037921 104.859691 setup
118.957847 118.699387 statistics.median
result 499971415
90.686796 90.520070 quickselect
result 499971415
316.317084 315.730812 jukka_a sqrt
result 499971415
137.946631 137.679415 jukka_a 10
result 499971415
0.013352 0.013325 approx
result 498666184
1.686462 1.683115 smallints
result None
# NOTES:
# - I didn't pay attention to what happens for lists of even length
# - Since the data is random, the algorithms need not be randomized. I put
# randomization in because calls to randrange/sample/etc might take
# significant time, and it's not really OK to ignore that time.
# - Since the data is random, Python's sort should be disadvantaged, at least
# when M >> N. (The main idea of Timsort is that it exploits patterns
# in data.)
# - As expected, Timsort does better when N>M than when M>N. I didn't expect it
# to bo _so_ much better, though.
from functools import partial
from random import randrange, sample, seed
from statistics import median
from time import perf_counter, process_time
import math
import sys
N = 100000001
M = 10000
Ta = None
Tb = None
def start():
global Ta, Tb
assert Ta is None
Ta = perf_counter()
Tb = process_time()
def stop(kind):
global Ta, Tb
assert Ta is not None
sys.stdout.write('{:02f} {:02f} {}\n'.format(
perf_counter() - Ta, process_time() - Tb, kind))
Ta = Tb = None
def setup():
start()
seed(123)
xs = [randrange(M) for _ in range(N)]
stop('setup')
return xs
def with_time(kind, f, xs):
start()
r = f(xs)
stop(kind)
sys.stdout.write('result {}\n'.format(r))
def quickselect(xs):
a, b = 0, len(xs)
m = (a + b) // 2
while True:
i = randrange(a, b)
xs[a], xs[i] = xs[i], xs[a]
i, j = a, a + 1
for k in range(a + 1, b):
if xs[k] < xs[i]:
t = xs[i]
xs[i] = xs[k]
xs[k] = xs[j]
xs[j] = t
i += 1
j += 1
elif xs[k] == xs[i]:
xs[j], xs[k] = xs[k], xs[j]
j += 1
if m < i:
b = i
elif j <= m:
a = j
else:
return xs[m]
def jukka_a(k, xs):
assert k >= 2
thresholds = sample(xs, k - 1)
thresholds = [float('-inf')] + sorted(set(thresholds)) + [float('inf')]
k = len(thresholds) - 1
bins = [0] * k
for x in xs:
a, b = 0, k + 1
while a + 1 != b:
m = (a + b) // 2
if x < thresholds[m]:
b = m
else:
a = m
#print(x, 'in [',thresholds[a], ':', thresholds[b],')')
bins[a] += 1
k, s = 0, bins[0]
m = len(xs) // 2
while s <= m:
k += 1
s += bins[k]
middle_bin = [x for x in xs if thresholds[k] <= x and x < thresholds[k+1]]
middle_bin.sort()
return middle_bin[m - s + bins[k]]
def approx(k, xs):
ys = sample(xs, k)
ys.sort()
return ys[len(ys) // 2]
def smallints(xs):
n = len(xs)
m = 1 + max(xs)
if m > 1000000:
return None
bins = [0] * m
for x in xs:
bins[x] += 1
x, s = 0, bins[0]
m = n // 2
while s <= m:
x += 1
s += bins[x]
return x
sys.stdout.write('N={} M={}\n'.format(N, M))
xs = setup()
with_time('statistics.median', median, list(xs))
with_time('quickselect', quickselect, list(xs))
with_time('jukka_a sqrt', partial(jukka_a, int(math.sqrt(N))), list(xs))
with_time('jukka_a 10', partial(jukka_a, 10), list(xs))
with_time('approx', partial(approx, int(math.sqrt(N))), list(xs))
with_time('smallints', smallints, list(xs))
# vim:sts=4:sw=4:
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment