Skip to content

Instantly share code, notes, and snippets.

@poliquin
Created June 24, 2016 20:49
Show Gist options
  • Save poliquin/4134f3b3e075112db9cf70508c06993f to your computer and use it in GitHub Desktop.
Save poliquin/4134f3b3e075112db9cf70508c06993f to your computer and use it in GitHub Desktop.
Reservoir sampling of file or stdin in Python 2/3
#!/usr/bin/env python
import sys
import random
from itertools import islice
"""
Reservoir sampling of input file/stream. Handles header rows.
"""
def main(n, infile=None, header=False, ignore=0, seed=None):
    """Write a reservoir sample of ``n`` input lines to stdout.

    Args:
        n (int): Sample size, number of lines.

    Kwargs:
        infile (str): Path to input, defaults to stdin.
        header (bool): Input contains a header line.
        ignore (int): Number of lines to ignore at start.
        seed (object): Seed handed to ``random.seed``; ``None`` lets
            ``random`` pick its own seed.

    Raises:
        ValueError: If ``n`` is negative (raised by ``islice``).

    The ignore option is applied before reading the header. You might
    use ignore when reading a data file that contains documentation,
    followed by a header row, followed by records.

    If the input holds ``n`` or fewer records, every record is written
    in its original order. The header, when present, is always written
    first and is never part of the sample.
    """
    # determine input source; remember whether we own the handle so that
    # a caller-provided stdin is never closed behind the caller's back
    opened_here = infile is not None
    stream = open(infile, 'r') if opened_here else sys.stdin

    try:
        # ignore beginning lines without building a throwaway list
        if ignore > 0:
            for _ in islice(stream, ignore):
                pass

        # optional header row; None (not StopIteration) on exhausted input
        hdr = next(stream, None) if header else None

        # pre-populate the sample with first n lines
        sample = list(islice(stream, n))

        random.seed(seed)
        # line number idx (0-based) is kept with probability n / (idx + 1)
        # and, if kept, overwrites a uniformly chosen slot of the sample
        for idx, row in enumerate(stream, start=n):
            if random.random() < n / (idx + 1.0):
                sample[random.randint(0, n - 1)] = row
    finally:
        # release the file even when reading fails part-way through
        if opened_here:
            stream.close()

    # write sample results
    if hdr is not None:
        sys.stdout.write(hdr)
    sys.stdout.writelines(sample)
if __name__ == '__main__':
    import os
    import argparse

    # command-line interface: sample size, optional input path, and flags
    parser = argparse.ArgumentParser(description='Reservoir sampling')
    parser.add_argument('n', type=int, help='Number of lines to sample')
    parser.add_argument('infile', nargs='?', help='Input source')
    parser.add_argument('-f', '--first', action='store_true',
                        help='Has header')
    parser.add_argument('-i', '--ignore', type=int, default=0,
                        help='Ignore first # lines')
    parser.add_argument('-s', '--seed',
                        help='Seed for random number generator')
    args = parser.parse_args()

    # fail early when a path was given but cannot be read
    path = args.infile
    if not (path is None or os.access(path, os.R_OK)):
        raise IOError('Cannot read file {}'.format(path))

    # prefer an integer seed when the text parses as one; a missing seed
    # (None) or a non-numeric string is handed to main() unchanged
    seed = args.seed
    try:
        seed = int(seed)
    except (ValueError, TypeError):
        pass

    main(args.n, path, args.first, args.ignore, seed)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment