Created
July 31, 2015 04:48
-
-
Save ceshine/ced4787bd41555b729de to your computer and use it in GitHub Desktop.
Script for Sampling from Stdin
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# sample: Output lines from stdin to stdout with a given probability, | |
# for a given duration, and with a given delay between lines. | |
# | |
# Example usage: seq 100 | sample -r 20% -d 1000 | |
# | |
# Dependency: Python 2.5 | |
# | |
# Original Author: http://jeroenjanssens.com | |
# Original Script: https://github.com/jeroenjanssens/data-science-at-the-command-line/blob/master/tools/sample | |
# Modified by CeShine Lee | |
import os | |
import argparse | |
from random import random | |
from time import time, sleep | |
from sys import stdin, stdout | |
from datetime import datetime, timedelta | |
def total_seconds(delta):
    """Return the length of a ``timedelta`` in seconds, as a float.

    The microsecond component is included so that durations shorter than
    one second (for example 500 milliseconds) do not collapse to zero.
    Computed by hand rather than with ``timedelta.total_seconds()`` so the
    helper also works on interpreters that predate that method.

    Args:
        delta: a ``datetime.timedelta`` instance.

    Returns:
        Total number of seconds represented by ``delta``.
    """
    whole_seconds = delta.days * 24 * 3600 + delta.seconds
    # Dividing by a float literal keeps this true division on Python 2 too.
    return whole_seconds + delta.microseconds / 1e6
def _parse_rate(text):
    """Convert a rate written as ``0.33``, ``33%`` or ``1/3`` to a float.

    Raises ValueError for malformed numbers and ZeroDivisionError for a
    zero denominator; range checking is left to the caller.
    """
    if '%' in text:
        # The percent sign is expected to be the final character.
        return float(text[:-1]) / 100.0
    if '/' in text:
        numerator, denominator = map(float, text.split('/')[:2])
        return numerator / denominator
    return float(text)


def main():
    """Copy a random sample of lines from FILE (or stdin) to stdout.

    Command-line options select the sampling rate, an optional time limit
    (the sum of all duration options; zero means no limit) and a pause, in
    milliseconds, taken after each input line that is processed. With
    ``--header`` the first line is always forwarded.

    Invalid option values are reported through ``parser.error``, which
    prints a usage message and exits with status 2. A keyboard interrupt
    or an I/O error on the streams (such as a closed downstream pipe) ends
    sampling quietly.
    """
    parser = argparse.ArgumentParser(description=(
        "Output lines from stdin to stdout with a given probability "
        "for a given duration, and with a given delay between lines."))
    # Text mode keeps lines the same type as those read from stdin, so they
    # can be written to the text-mode stdout on Python 3 as well.
    parser.add_argument('file', nargs='?', type=argparse.FileType('r'),
                        default=stdin, help="File", metavar="FILE")
    parser.add_argument('-W', '--weeks', type=float, default=0,
                        help="Duration of sampling in weeks")
    parser.add_argument('-D', '--days', type=float, default=0,
                        help="Duration of sampling in days")
    parser.add_argument('-H', '--hours', type=float, default=0,
                        help="Duration of sampling in hours")
    parser.add_argument('-m', '--minutes', type=float, default=0,
                        help="Duration of sampling in minutes")
    parser.add_argument('-s', '--seconds', type=float, default=0,
                        help="Duration of sampling in seconds")
    parser.add_argument('-t', '--milliseconds', type=float, default=0,
                        help="Duration of sampling in milliseconds")
    parser.add_argument('-u', '--microseconds', type=float, default=0,
                        help="Duration of sampling in microseconds")
    parser.add_argument('-r', '--rate', default='100%',
                        help="Rate between 0 and 1 using either 0.33, 33%%, 1/3 notation.")
    parser.add_argument('-d', '--delay', default=0, type=int,
                        help="Time in milliseconds between each line of output")
    parser.add_argument('--header', action='store_true',
                        help="The file being sampled has a header")
    args = parser.parse_args()

    invalid_rate_msg = ("Invalid rate. Please specify a rate between 0"
                        " and 1 using either 0.33, 33%, 1/3 notation.")

    # argparse has already converted the delay to int; only the sign can
    # still be wrong, and a negative value would make sleep() fail later.
    if args.delay < 0:
        parser.error("Invalid delay. Please specify a delay in ms.")
    delay = args.delay / 1000.0

    try:
        rate = _parse_rate(args.rate)
    except (ValueError, ZeroDivisionError):
        parser.error(invalid_rate_msg)
    # Written as a chained comparison so that NaN is rejected as well.
    if not 0 < rate <= 1:
        parser.error(invalid_rate_msg)

    try:
        duration = total_seconds(timedelta(
            weeks=args.weeks, days=args.days, hours=args.hours,
            minutes=args.minutes, seconds=args.seconds,
            milliseconds=args.milliseconds,
            microseconds=args.microseconds))
    except (OverflowError, ValueError):
        # timedelta rejects infinite, NaN or out-of-range components.
        parser.error("Invalid duration.")
    if duration < 0:
        parser.error("Invalid duration.")

    source = args.file
    start = time()
    try:
        if args.header:
            header = source.readline()
            if not header:
                return
            stdout.write(header)
        while True:
            line = source.readline()
            if not line:
                return
            if random() <= rate:
                stdout.write(line)
                # Flush so a downstream consumer sees each line immediately.
                stdout.flush()
            # A duration of zero means "no time limit".
            if duration and (time() - start) > duration:
                return
            sleep(delay)
    except (KeyboardInterrupt, IOError):
        # Ctrl-C or a closed downstream pipe simply ends the sampling.
        pass
    finally:
        if source is not stdin:
            source.close()
# Run the sampler only when executed as a script, not when imported.
if __name__ == "__main__":
    # The bare ``exit`` helper is injected by the ``site`` module and is
    # absent under ``python -S``; raising SystemExit needs no import.
    raise SystemExit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment