darius/gist:173482

## gistfile1.py
"""
Would this make a better random.sample() function for the Python
standard library? Robert Floyd's algorithm as presented in Jon
Bentley, "A Sample of Brilliance".

Answer: no, it's slower; somewhat slower even if you tune it.
I tried a few different test cases -- both small and large
result lists. I'd expect it to use much less memory in some cases,
but my flailings turned up no case where that seemed to matter.

(But sample_set() seems consistently faster than random.sample().)

On the bright side, it's less code overall than the standard library's,
even after tuning.
"""

import random
from random import randint

## random.sample(range(1, 11), 3)
#. [1, 10, 2]

## sample(range(1, 11), 3)
#. [8, 1, 6]

## help(random.sample)
#. Help on method sample in module random:
#.
#. sample(self, population, k) method of random.Random instance
#.     Chooses k unique random elements from a population sequence.
#.
#.     Returns a new list containing elements from the population while
#.     leaving the original population unchanged.  The resulting list is
#.     in selection order so that all sub-slices will also be valid random
#.     samples.  This allows raffle winners (the sample) to be partitioned
#.     into grand prize and second place winners (the subslices).
#.
#.     Members of the population need not be hashable or unique.  If the
#.     population contains repeats, then each occurrence is a possible
#.     selection in the sample.
#.
#.     To choose a sample in a range of integers, use xrange as an argument.
#.     This is especially fast and space efficient for sampling from a
#.     large population:   sample(xrange(10000000), 60)
#.
#.

def sample(population, k):
    "Chooses k unique random elements from a population sequence."
    n = len(population)
    first, next = None, {}
    for j in range(n - k, n):
        t = randint(0, j)
        if t in next:
            next[j] = next[t]
            next[t] = j
        else:
            next[t] = first
            first = t
    return [population[i] for i in iterate(first, next.get)]

def iterate(x, f):
    while x is not None:
        yield x
        x = f(x)

def sample_set(population, k):
    "Like sample(), but returns a set instead of a list."
    # This code was tuned with some hacks from random.sample() -- _int, etc.
    n = len(population)
    _int = int
    s = set()
    s_add = s.add
    for j in xrange(n - k, n):
        t = _int(random() * j)
        s_add(j if t in s else t)
    return s

def benchmark(module_name, function_name, n, k, ntimes):
    import timeit
    t = timeit.Timer(setup='from %s import %s' % (module_name, function_name),
                     stmt='%s(xrange(%d), %d)' % (function_name, n, k))
    return t.timeit(ntimes)

# sample.benchmark('random', 'sample', 100000000, 10000, 100)
	"""
	Would this make a better random.sample() function for the Python
	standard library? Robert Floyd's algorithm as presented in Jon
	Bentley, "A Sample of Brilliance".

	Answer: no, it's slower; somewhat slower even if you tune it.
	I tried a few different test cases -- both small and large
	result lists. I'd expect it to use much less memory in some cases,
	but my flailings turned up no case where that seemed to matter.

	(But sample_set() seems consistently faster than random.sample().)

	On the bright side, it's less code overall than the standard library's,
	even after tuning.
	"""

	import random
	from random import randint

	## random.sample(range(1, 11), 3)
	#. [1, 10, 2]

	## sample(range(1, 11), 3)
	#. [8, 1, 6]

	## help(random.sample)
	#. Help on method sample in module random:
	#.
	#. sample(self, population, k) method of random.Random instance
	#. Chooses k unique random elements from a population sequence.
	#.
	#. Returns a new list containing elements from the population while
	#. leaving the original population unchanged. The resulting list is
	#. in selection order so that all sub-slices will also be valid random
	#. samples. This allows raffle winners (the sample) to be partitioned
	#. into grand prize and second place winners (the subslices).
	#.
	#. Members of the population need not be hashable or unique. If the
	#. population contains repeats, then each occurrence is a possible
	#. selection in the sample.
	#.
	#. To choose a sample in a range of integers, use xrange as an argument.
	#. This is especially fast and space efficient for sampling from a
	#. large population: sample(xrange(10000000), 60)
	#.
	#.

	def sample(population, k):
	"Chooses k unique random elements from a population sequence."
	n = len(population)
	first, next = None, {}
	for j in range(n - k, n):
	t = randint(0, j)
	if t in next:
	next[j] = next[t]
	next[t] = j
	else:
	next[t] = first
	first = t
	return [population[i] for i in iterate(first, next.get)]

	def iterate(x, f):
	while x is not None:
	yield x
	x = f(x)

	def sample_set(population, k):
	"Like sample(), but returns a set instead of a list."
	# This code was tuned with some hacks from random.sample() -- _int, etc.
	n = len(population)
	_int = int
	s = set()
	s_add = s.add
	for j in xrange(n - k, n):
	t = _int(random() * j)
	s_add(j if t in s else t)
	return s

	def benchmark(module_name, function_name, n, k, ntimes):
	import timeit
	t = timeit.Timer(setup='from %s import %s' % (module_name, function_name),
	stmt='%s(xrange(%d), %d)' % (function_name, n, k))
	return t.timeit(ntimes)

	# sample.benchmark('random', 'sample', 100000000, 10000, 100)