Created
June 24, 2016 20:49
-
-
Save poliquin/4134f3b3e075112db9cf70508c06993f to your computer and use it in GitHub Desktop.
Reservoir sampling of file or stdin in Python 2/3
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import sys | |
import random | |
from itertools import islice | |
""" | |
Reservoir sampling of input file/stream. Handles header rows. | |
""" | |
def main(n, infile=None, header=False, ignore=0, seed=None):
    """Write a reservoir sample of input lines to stdout.

    Args:
        n (int): Sample size, number of lines. Must be non-negative.

    Kwargs:
        infile (str): Path to input, defaults to stdin.
        header (bool): Input contains a header line.
        ignore (int): Number of lines to ignore at start.
        seed (object): Seed for random, can be any hashable object.

    Raises:
        ValueError: If n is negative.

    The ignore option is applied before reading the header. You might
    use ignore when reading a data file that contains documentation,
    followed by a header row, followed by records.

    If the input holds n or fewer records, all of them are written in
    their original order. The header, when present, is written first.
    """
    if n < 0:
        raise ValueError(
            'Sample size n must be non-negative, got {}'.format(n))

    # determine input source; only a file opened here is ours to close
    opened_here = infile is not None
    stream = open(infile, 'r') if opened_here else sys.stdin

    try:
        # ignore beginning lines without keeping them in memory
        if ignore > 0:
            for _ in islice(stream, ignore):
                pass

        # optional header row; None when the input is already exhausted
        hdr = next(stream, None) if header else None

        # pre-populate the sample with first n lines
        sample = list(islice(stream, n))

        random.seed(seed)
        for idx, row in enumerate(stream, start=n):
            # row number idx (0-based) enters the sample with probability
            # n / (idx + 1) and evicts a uniformly chosen current member
            if random.random() < n / (idx + 1.0):
                repl = random.randint(0, n - 1)
                sample[repl] = row
    finally:
        # release the handle even if reading fails; leave stdin open
        if opened_here:
            stream.close()

    # write sample results
    if hdr:
        sys.stdout.write(hdr)
    for row in sample:
        sys.stdout.write(row)
if __name__ == '__main__':
    import argparse
    import os

    # command-line interface: sample size, optional input path, options
    parser = argparse.ArgumentParser(description='Reservoir sampling')
    parser.add_argument('n', type=int, help='Number of lines to sample')
    parser.add_argument('infile', nargs='?', help='Input source')
    parser.add_argument('-f', '--first', action='store_true',
                        help='Has header')
    parser.add_argument('-i', '--ignore', type=int, default=0,
                        help='Ignore first # lines')
    parser.add_argument('-s', '--seed',
                        help='Seed for random number generator')
    args = parser.parse_args()

    # a named input must be readable; with no path, stdin is used unchecked
    readable = args.infile is None or os.access(args.infile, os.R_OK)
    if not readable:
        raise IOError('Cannot read file {}'.format(args.infile))

    # use an integer seed when the text parses as one; otherwise keep the
    # string (or None) since any hashable object is an acceptable seed
    seed = args.seed
    if seed is not None:
        try:
            seed = int(seed)
        except ValueError:
            pass

    main(args.n, infile=args.infile, header=args.first,
         ignore=args.ignore, seed=seed)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment