rgrig/Median.py

## big.dense.log
N=10000001 M=1000
10.169694 10.154661 setup
2.906270 2.901401 statistics.median
result 499
15.317439 15.293849 quickselect
result 499
22.627862 22.582209 jukka_a sqrt
result 499
9.278614 9.260504 jukka_a 10
result 499
0.003925 0.003904 approx
result 500
0.856507 0.854239 smallints
result 499

## big.sparse.log
N=10000001 M=100000000
10.735744 10.718981 setup
8.204952 8.190202 statistics.median
result 49990589
9.003044 8.981585 quickselect
result 49990589
26.867112 26.827628 jukka_a sqrt
result 49990589
10.137187 10.121637 jukka_a 10
result 49990589
0.004148 0.004141 approx
result 49197494
0.168329 0.168003 smallints
result None

## huge.dense.log
N=100000001 M=10000
109.183333 108.971217 setup
47.319695 47.233222 statistics.median
result 5000
59.217462 59.105478 quickselect
result 5000
299.210840 298.685356 jukka_a sqrt
result 5000
103.222082 103.026732 jukka_a 10
result 5000
0.016545 0.016499 approx
result 4971
10.644286 10.624396 smallints
result 5000

## huge.sparse.log
N=100000001 M=1000000000
105.037921 104.859691 setup
118.957847 118.699387 statistics.median
result 499971415
90.686796 90.520070 quickselect
result 499971415
316.317084 315.730812 jukka_a sqrt
result 499971415
137.946631 137.679415 jukka_a 10
result 499971415
0.013352 0.013325 approx
result 498666184
1.686462 1.683115 smallints
result None

## Median.py
# NOTES:
#   - I didn't pay attention to what happens for lists of even length
#   - Since the data is random, the algorithms need not be randomized. I put
#     randomization in because calls to randrange/sample/etc might take
#     significant time, and it's not really OK to ignore that time.
#   - Since the data is random, Python's sort should be disadvantaged, at least
#     when M >> N. (The main idea of Timsort is that it exploits patterns
#     in data.)
#   - As expected, Timsort does better when N>M than when M>N. I didn't expect it
#     to bo _so_ much better, though.

from functools import partial
from random import randrange, sample, seed
from statistics import median
from time import perf_counter, process_time

import math
import sys

N = 100000001
M = 10000

Ta = None
Tb = None

def start():
    global Ta, Tb
    assert Ta is None
    Ta = perf_counter()
    Tb = process_time()

def stop(kind):
    global Ta, Tb
    assert Ta is not None
    sys.stdout.write('{:02f} {:02f} {}\n'.format(
        perf_counter() - Ta, process_time() - Tb, kind))
    Ta = Tb = None

def setup():
    start()
    seed(123)
    xs = [randrange(M) for _ in range(N)]
    stop('setup')
    return xs

def with_time(kind, f, xs):
    start()
    r = f(xs)
    stop(kind)
    sys.stdout.write('result {}\n'.format(r))

def quickselect(xs):
    a, b = 0, len(xs)
    m = (a + b) // 2
    while True:
        i = randrange(a, b)
        xs[a], xs[i] = xs[i], xs[a]
        i, j = a, a + 1
        for k in range(a + 1, b):
            if xs[k] < xs[i]:
                t = xs[i]
                xs[i] = xs[k]
                xs[k] = xs[j]
                xs[j] = t
                i += 1
                j += 1
            elif xs[k] == xs[i]:
                xs[j], xs[k] = xs[k], xs[j]
                j += 1
        if m < i:
            b = i
        elif j <= m:
            a = j
        else:
            return xs[m]


def jukka_a(k, xs):
    assert k >= 2
    thresholds = sample(xs, k - 1)
    thresholds = [float('-inf')] + sorted(set(thresholds)) + [float('inf')]
    k = len(thresholds) - 1
    bins = [0] * k
    for x in xs:
        a, b = 0, k + 1
        while a + 1 != b:
            m = (a + b) // 2
            if x < thresholds[m]:
                b = m
            else:
                a = m
        #print(x, 'in [',thresholds[a], ':', thresholds[b],')')
        bins[a] += 1
    k, s = 0, bins[0]
    m = len(xs) // 2
    while s <= m:
        k += 1
        s += bins[k]
    middle_bin = [x for x in xs if thresholds[k] <= x and x < thresholds[k+1]]
    middle_bin.sort()
    return middle_bin[m - s + bins[k]]

def approx(k, xs):
    ys = sample(xs, k)
    ys.sort()
    return ys[len(ys) // 2]

def smallints(xs):
    n = len(xs)
    m = 1 + max(xs)
    if m > 1000000:
        return None
    bins = [0] * m
    for x in xs:
        bins[x] += 1
    x, s = 0, bins[0]
    m = n // 2
    while s <= m:
        x += 1
        s += bins[x]
    return x

sys.stdout.write('N={} M={}\n'.format(N, M))
xs = setup()
with_time('statistics.median', median, list(xs))
with_time('quickselect', quickselect, list(xs))
with_time('jukka_a sqrt', partial(jukka_a, int(math.sqrt(N))), list(xs))
with_time('jukka_a 10', partial(jukka_a, 10), list(xs))
with_time('approx', partial(approx, int(math.sqrt(N))), list(xs))
with_time('smallints', smallints, list(xs))

# vim:sts=4:sw=4:
	N=10000001 M=1000
	10.169694 10.154661 setup
	2.906270 2.901401 statistics.median
	result 499
	15.317439 15.293849 quickselect
	result 499
	22.627862 22.582209 jukka_a sqrt
	result 499
	9.278614 9.260504 jukka_a 10
	result 499
	0.003925 0.003904 approx
	result 500
	0.856507 0.854239 smallints
	result 499
	N=10000001 M=100000000
	10.735744 10.718981 setup
	8.204952 8.190202 statistics.median
	result 49990589
	9.003044 8.981585 quickselect
	result 49990589
	26.867112 26.827628 jukka_a sqrt
	result 49990589
	10.137187 10.121637 jukka_a 10
	result 49990589
	0.004148 0.004141 approx
	result 49197494
	0.168329 0.168003 smallints
	result None
	N=100000001 M=10000
	109.183333 108.971217 setup
	47.319695 47.233222 statistics.median
	result 5000
	59.217462 59.105478 quickselect
	result 5000
	299.210840 298.685356 jukka_a sqrt
	result 5000
	103.222082 103.026732 jukka_a 10
	result 5000
	0.016545 0.016499 approx
	result 4971
	10.644286 10.624396 smallints
	result 5000
	N=100000001 M=1000000000
	105.037921 104.859691 setup
	118.957847 118.699387 statistics.median
	result 499971415
	90.686796 90.520070 quickselect
	result 499971415
	316.317084 315.730812 jukka_a sqrt
	result 499971415
	137.946631 137.679415 jukka_a 10
	result 499971415
	0.013352 0.013325 approx
	result 498666184
	1.686462 1.683115 smallints
	result None
	# NOTES:
	# - I didn't pay attention to what happens for lists of even length
	# - Since the data is random, the algorithms need not be randomized. I put
	# randomization in because calls to randrange/sample/etc might take
	# significant time, and it's not really OK to ignore that time.
	# - Since the data is random, Python's sort should be disadvantaged, at least
	# when M >> N. (The main idea of Timsort is that it exploits patterns
	# in data.)
	# - As expected, Timsort does better when N>M than when M>N. I didn't expect it
	# to bo _so_ much better, though.

	from functools import partial
	from random import randrange, sample, seed
	from statistics import median
	from time import perf_counter, process_time

	import math
	import sys

	N = 100000001
	M = 10000

	Ta = None
	Tb = None

	def start():
	global Ta, Tb
	assert Ta is None
	Ta = perf_counter()
	Tb = process_time()

	def stop(kind):
	global Ta, Tb
	assert Ta is not None
	sys.stdout.write('{:02f} {:02f} {}\n'.format(
	perf_counter() - Ta, process_time() - Tb, kind))
	Ta = Tb = None

	def setup():
	start()
	seed(123)
	xs = [randrange(M) for _ in range(N)]
	stop('setup')
	return xs

	def with_time(kind, f, xs):
	start()
	r = f(xs)
	stop(kind)
	sys.stdout.write('result {}\n'.format(r))

	def quickselect(xs):
	a, b = 0, len(xs)
	m = (a + b) // 2
	while True:
	i = randrange(a, b)
	xs[a], xs[i] = xs[i], xs[a]
	i, j = a, a + 1
	for k in range(a + 1, b):
	if xs[k] < xs[i]:
	t = xs[i]
	xs[i] = xs[k]
	xs[k] = xs[j]
	xs[j] = t
	i += 1
	j += 1
	elif xs[k] == xs[i]:
	xs[j], xs[k] = xs[k], xs[j]
	j += 1
	if m < i:
	b = i
	elif j <= m:
	a = j
	else:
	return xs[m]


	def jukka_a(k, xs):
	assert k >= 2
	thresholds = sample(xs, k - 1)
	thresholds = [float('-inf')] + sorted(set(thresholds)) + [float('inf')]
	k = len(thresholds) - 1
	bins = [0] * k
	for x in xs:
	a, b = 0, k + 1
	while a + 1 != b:
	m = (a + b) // 2
	if x < thresholds[m]:
	b = m
	else:
	a = m
	#print(x, 'in [',thresholds[a], ':', thresholds[b],')')
	bins[a] += 1
	k, s = 0, bins[0]
	m = len(xs) // 2
	while s <= m:
	k += 1
	s += bins[k]
	middle_bin = [x for x in xs if thresholds[k] <= x and x < thresholds[k+1]]
	middle_bin.sort()
	return middle_bin[m - s + bins[k]]

	def approx(k, xs):
	ys = sample(xs, k)
	ys.sort()
	return ys[len(ys) // 2]

	def smallints(xs):
	n = len(xs)
	m = 1 + max(xs)
	if m > 1000000:
	return None
	bins = [0] * m
	for x in xs:
	bins[x] += 1
	x, s = 0, bins[0]
	m = n // 2
	while s <= m:
	x += 1
	s += bins[x]
	return x

	sys.stdout.write('N={} M={}\n'.format(N, M))
	xs = setup()
	with_time('statistics.median', median, list(xs))
	with_time('quickselect', quickselect, list(xs))
	with_time('jukka_a sqrt', partial(jukka_a, int(math.sqrt(N))), list(xs))
	with_time('jukka_a 10', partial(jukka_a, 10), list(xs))
	with_time('approx', partial(approx, int(math.sqrt(N))), list(xs))
	with_time('smallints', smallints, list(xs))

	# vim:sts=4:sw=4: