Last active
August 29, 2015 14:11
-
-
Save rgrig/d4dd95d75dba40e41694 to your computer and use it in GitHub Desktop.
computing the median for a list of integers
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
N=10000001 M=1000 | |
10.169694 10.154661 setup | |
2.906270 2.901401 statistics.median | |
result 499 | |
15.317439 15.293849 quickselect | |
result 499 | |
22.627862 22.582209 jukka_a sqrt | |
result 499 | |
9.278614 9.260504 jukka_a 10 | |
result 499 | |
0.003925 0.003904 approx | |
result 500 | |
0.856507 0.854239 smallints | |
result 499 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
N=10000001 M=100000000 | |
10.735744 10.718981 setup | |
8.204952 8.190202 statistics.median | |
result 49990589 | |
9.003044 8.981585 quickselect | |
result 49990589 | |
26.867112 26.827628 jukka_a sqrt | |
result 49990589 | |
10.137187 10.121637 jukka_a 10 | |
result 49990589 | |
0.004148 0.004141 approx | |
result 49197494 | |
0.168329 0.168003 smallints | |
result None |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
N=100000001 M=10000 | |
109.183333 108.971217 setup | |
47.319695 47.233222 statistics.median | |
result 5000 | |
59.217462 59.105478 quickselect | |
result 5000 | |
299.210840 298.685356 jukka_a sqrt | |
result 5000 | |
103.222082 103.026732 jukka_a 10 | |
result 5000 | |
0.016545 0.016499 approx | |
result 4971 | |
10.644286 10.624396 smallints | |
result 5000 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
N=100000001 M=1000000000 | |
105.037921 104.859691 setup | |
118.957847 118.699387 statistics.median | |
result 499971415 | |
90.686796 90.520070 quickselect | |
result 499971415 | |
316.317084 315.730812 jukka_a sqrt | |
result 499971415 | |
137.946631 137.679415 jukka_a 10 | |
result 499971415 | |
0.013352 0.013325 approx | |
result 498666184 | |
1.686462 1.683115 smallints | |
result None |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# NOTES: | |
# - I didn't pay attention to what happens for lists of even length | |
# - Since the data is random, the algorithms need not be randomized. I put | |
# randomization in because calls to randrange/sample/etc might take | |
# significant time, and it's not really OK to ignore that time. | |
# - Since the data is random, Python's sort should be disadvantaged, at least | |
# when M >> N. (The main idea of Timsort is that it exploits patterns | |
# in data.) | |
# - As expected, Timsort does better when N>M than when M>N. I didn't expect it | |
# to bo _so_ much better, though. | |
from functools import partial | |
from random import randrange, sample, seed | |
from statistics import median | |
from time import perf_counter, process_time | |
import math | |
import sys | |
N = 100000001 | |
M = 10000 | |
Ta = None | |
Tb = None | |
def start(): | |
global Ta, Tb | |
assert Ta is None | |
Ta = perf_counter() | |
Tb = process_time() | |
def stop(kind): | |
global Ta, Tb | |
assert Ta is not None | |
sys.stdout.write('{:02f} {:02f} {}\n'.format( | |
perf_counter() - Ta, process_time() - Tb, kind)) | |
Ta = Tb = None | |
def setup(): | |
start() | |
seed(123) | |
xs = [randrange(M) for _ in range(N)] | |
stop('setup') | |
return xs | |
def with_time(kind, f, xs): | |
start() | |
r = f(xs) | |
stop(kind) | |
sys.stdout.write('result {}\n'.format(r)) | |
def quickselect(xs): | |
a, b = 0, len(xs) | |
m = (a + b) // 2 | |
while True: | |
i = randrange(a, b) | |
xs[a], xs[i] = xs[i], xs[a] | |
i, j = a, a + 1 | |
for k in range(a + 1, b): | |
if xs[k] < xs[i]: | |
t = xs[i] | |
xs[i] = xs[k] | |
xs[k] = xs[j] | |
xs[j] = t | |
i += 1 | |
j += 1 | |
elif xs[k] == xs[i]: | |
xs[j], xs[k] = xs[k], xs[j] | |
j += 1 | |
if m < i: | |
b = i | |
elif j <= m: | |
a = j | |
else: | |
return xs[m] | |
def jukka_a(k, xs): | |
assert k >= 2 | |
thresholds = sample(xs, k - 1) | |
thresholds = [float('-inf')] + sorted(set(thresholds)) + [float('inf')] | |
k = len(thresholds) - 1 | |
bins = [0] * k | |
for x in xs: | |
a, b = 0, k + 1 | |
while a + 1 != b: | |
m = (a + b) // 2 | |
if x < thresholds[m]: | |
b = m | |
else: | |
a = m | |
#print(x, 'in [',thresholds[a], ':', thresholds[b],')') | |
bins[a] += 1 | |
k, s = 0, bins[0] | |
m = len(xs) // 2 | |
while s <= m: | |
k += 1 | |
s += bins[k] | |
middle_bin = [x for x in xs if thresholds[k] <= x and x < thresholds[k+1]] | |
middle_bin.sort() | |
return middle_bin[m - s + bins[k]] | |
def approx(k, xs): | |
ys = sample(xs, k) | |
ys.sort() | |
return ys[len(ys) // 2] | |
def smallints(xs): | |
n = len(xs) | |
m = 1 + max(xs) | |
if m > 1000000: | |
return None | |
bins = [0] * m | |
for x in xs: | |
bins[x] += 1 | |
x, s = 0, bins[0] | |
m = n // 2 | |
while s <= m: | |
x += 1 | |
s += bins[x] | |
return x | |
sys.stdout.write('N={} M={}\n'.format(N, M)) | |
xs = setup() | |
with_time('statistics.median', median, list(xs)) | |
with_time('quickselect', quickselect, list(xs)) | |
with_time('jukka_a sqrt', partial(jukka_a, int(math.sqrt(N))), list(xs)) | |
with_time('jukka_a 10', partial(jukka_a, 10), list(xs)) | |
with_time('approx', partial(approx, int(math.sqrt(N))), list(xs)) | |
with_time('smallints', smallints, list(xs)) | |
# vim:sts=4:sw=4: |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment