Skip to content

Instantly share code, notes, and snippets.

@ceshine
Created July 31, 2015 04:48
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ceshine/ced4787bd41555b729de to your computer and use it in GitHub Desktop.
Save ceshine/ced4787bd41555b729de to your computer and use it in GitHub Desktop.
Script for Sampling from Stdin
#!/usr/bin/env python
# sample: Output lines from stdin to stdout with a given probability,
# for a given duration, and with a given delay between lines.
#
# Example usage: seq 100 | sample -r 20% -d 1000
#
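# Dependency: Python 2.5+ (argparse ships with the standard library from Python 2.7; on older versions install it separately)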
#
# Original Author: http://jeroenjanssens.com
# Original Script: https://github.com/jeroenjanssens/data-science-at-the-command-line/blob/master/tools/sample
# Modified by CeShine Lee
import os
import argparse
from random import random
from time import time, sleep
from sys import stdin, stdout
from datetime import datetime, timedelta
def total_seconds(delta):
    """Return the length of *delta* in seconds, fractional part included.

    Args:
        delta: A ``datetime.timedelta``.

    Returns:
        The duration as a float number of seconds.
    """
    # Hand-rolled instead of delta.total_seconds() because that method only
    # exists from Python 2.7 and the file header targets Python 2.5.
    # The microseconds term matters: without it any sub-second duration
    # collapses to 0.
    whole_seconds = delta.seconds + (24 * 3600 * delta.days)
    return whole_seconds + delta.microseconds / 1000000.0
def _parse_rate(text):
    """Convert a rate written as ``0.33``, ``33%`` or ``1/3`` into a float.

    Raises:
        ValueError: If *text* is not in one of those notations, or if a
            fraction has a zero denominator.
    """
    if '%' in text:
        # Only a trailing percent sign is supported; a '%' anywhere else is
        # left in the string and makes float() raise ValueError.
        return float(text[:-1]) / 100.0
    if '/' in text:
        numerator, denominator = map(float, text.split('/')[:2])
        if denominator == 0:
            # Report this as a bad rate instead of a ZeroDivisionError
            # traceback.
            raise ValueError("rate denominator must not be zero")
        return numerator / denominator
    return float(text)


def _copy_sample(source, sink, rate, delay, duration, header):
    """Copy a random sample of the lines of *source* to *sink*.

    Args:
        source: File-like object read with ``readline()``.
        sink: File-like object written with ``write()`` and ``flush()``.
        rate: Probability in (0, 1] that any given line is emitted.
        delay: Seconds to sleep after each emitted line (0 disables it).
        duration: Maximum sampling time in seconds (0 means unlimited).
        header: When true, the first line is always copied.
    """
    started = time()
    if header:
        line = source.readline()
        if not line:
            return
        sink.write(line)
    while True:
        line = source.readline()
        if not line:
            return
        # Checked for every line read, so a low rate cannot postpone the
        # deadline until the next sampled line.
        if duration and (time() - started) > duration:
            return
        if random() <= rate:
            sink.write(line)
            sink.flush()
            if delay:
                sleep(delay)


def main():
    """Sample lines from a file (or stdin) to stdout.

    Command-line options select the sampling rate, an optional maximum
    duration, an optional delay after each emitted line, and whether the
    first line is a header that must always be copied. Invalid option
    values end the program through ``parser.error``.
    """
    parser = argparse.ArgumentParser(description=(
        "Output lines from stdin to stdout with a given probability "
        "for a given duration, and with a given delay between lines."))
    # Text mode: lines are written to the text-mode stdout, so reading
    # bytes would make the write fail on Python 3.
    parser.add_argument('file', nargs='?', type=argparse.FileType('r'),
                        default=stdin, help="File", metavar="FILE")
    parser.add_argument('-W', '--weeks', type=float, default=0,
                        help="Duration of sampling in weeks")
    parser.add_argument('-D', '--days', type=float, default=0,
                        help="Duration of sampling in days")
    parser.add_argument('-H', '--hours', type=float, default=0,
                        help="Duration of sampling in hours")
    parser.add_argument('-m', '--minutes', type=float, default=0,
                        help="Duration of sampling in minutes")
    parser.add_argument('-s', '--seconds', type=float, default=0,
                        help="Duration of sampling in seconds")
    parser.add_argument('-t', '--milliseconds', type=float, default=0,
                        help="Duration of sampling in milliseconds")
    parser.add_argument('-u', '--microseconds', type=float, default=0,
                        help="Duration of sampling in microseconds")
    parser.add_argument('-r', '--rate', default='100%',
                        help="Rate between 0 and 1 using either 0.33, 33%%, 1/3 notation.")
    parser.add_argument('-d', '--delay', default=0, type=int,
                        help="Time in milliseconds between each line of output")
    parser.add_argument('--header', action='store_true',
                        help="The file being sampled has a header")
    args = parser.parse_args()

    invalid_rate_msg = ("Invalid rate. Please specify a rate between 0"
                        " and 1 using either 0.33, 33%, 1/3 notation.")

    # argparse already guarantees an int here; only the sign needs checking
    # (a negative value would make sleep() fail later).
    if args.delay < 0:
        parser.error("Invalid delay. Please specify a delay in ms.")
    delay = args.delay / 1000.0

    try:
        duration = total_seconds(timedelta(
            weeks=args.weeks, days=args.days, hours=args.hours,
            minutes=args.minutes, seconds=args.seconds,
            milliseconds=args.milliseconds,
            microseconds=args.microseconds))
    except (OverflowError, ValueError):
        # timedelta rejects out-of-range, infinite and NaN components.
        parser.error("Invalid duration.")
    if duration < 0:
        parser.error("Invalid duration.")

    try:
        rate = _parse_rate(args.rate)
    except ValueError:
        parser.error(invalid_rate_msg)
    # Written as a chained comparison so that NaN is rejected as well.
    if not 0 < rate <= 1:
        parser.error(invalid_rate_msg)

    try:
        _copy_sample(args.file, stdout, rate, delay, duration, args.header)
    except (KeyboardInterrupt, IOError):
        # Ctrl-C or a closed downstream pipe (e.g. `| head`) ends sampling
        # quietly; any other error is a real bug and is allowed to surface.
        pass
    finally:
        if args.file is not stdin:
            args.file.close()
if __name__ == "__main__":
    # exit() is a helper injected by the site module for interactive use and
    # is missing when Python runs with -S; raising SystemExit directly gives
    # the same exit status without that dependency.
    raise SystemExit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment