Skip to content

Instantly share code, notes, and snippets.

@stantonk
Created June 13, 2014 21:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save stantonk/49536e4c6729bab0d816 to your computer and use it in GitHub Desktop.
Save stantonk/49536e4c6729bab0d816 to your computer and use it in GitHub Desktop.
Randomly sample lines from a very large file
import argparse
import mmap
import random
def get_line_count(filename):
    """Return the number of lines in *filename*.

    The file is memory-mapped read-only and scanned one line at a time, so
    the whole file is never loaded into Python objects at once.  A final
    line that lacks a trailing newline is still counted.

    Args:
        filename: path of the file whose lines are counted.

    Returns:
        The number of lines as an int; 0 for an empty file.
    """
    # Open read-only: counting lines must not require write permission.
    with open(filename, "rb") as f:
        # mmap cannot map a zero-length file, so handle that case up front.
        f.seek(0, 2)  # whence=2: seek to end of file
        if f.tell() == 0:
            return 0
        # access=ACCESS_READ is the portable spelling of a read-only mapping
        # (prot= is only accepted on Unix).
        mm = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
        try:
            # readline() returns an empty bytes object at end of map; b"" is
            # the right sentinel on Python 3 and equals "" on Python 2.
            return sum(1 for _ in iter(mm.readline, b""))
        finally:
            mm.close()
if __name__ == '__main__':
    # Script entry point: copy a uniformly random sample of `size` lines
    # from `filename` into outfile.csv, preserving their original order.
    parser = argparse.ArgumentParser()
    parser.add_argument("filename", type=str, help="name of file to sample")
    parser.add_argument("size", type=int, help="sample size")
    args = parser.parse_args()

    linecount = get_line_count(args.filename)

    # random.sample raises ValueError for a negative size or one larger than
    # the population; report that as a usage error instead of a traceback.
    if not 0 <= args.size <= linecount:
        parser.error("size must be between 0 and %d (the number of lines in %s)"
                     % (linecount, args.filename))

    # Prefer the lazy xrange on Python 2 so a huge line count does not build
    # a list; on Python 3 the name is gone and range is already lazy.
    try:
        line_numbers = xrange(linecount)
    except NameError:
        line_numbers = range(linecount)
    lines_to_grab = set(random.sample(line_numbers, args.size))

    # Second pass: stream the file and keep only the chosen line numbers.
    with open(args.filename, 'r', 1) as f, open('outfile.csv', 'w') as of:
        of.writelines(
            line for l, line in enumerate(f) if l in lines_to_grab
        )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment