kjhealy/gist:85f23c3ba158770ffa3ae09de2ef946a

## gistfile1.txt
## Take an approximately 0.1 percent sample of lines from this gzipped
## csv file. We do this by having gzip stream to STDOUT and then the
## Perl one-liner does the sampling. On an 80GB file, the output will
## be ~80MB. Strictly speaking this is only roughly a 1% sample.
## Also, there are a few possible edge cases with the rough-and-ready
## sampling method, but they're not that likely to worry us given what
## we want to do.
gzip -cd giantfile.csv.gz | perl -ne 'print if (rand() < .001)' > sample.csv

## Don't forget to put back the column names: save the first line of the
## original file as header.csv, then prepend it to the sample. The
## combined file is built in tmp.csv and only moved over sample.csv if
## cat succeeded, so a failure leaves the existing sample untouched.
gzip -cd giantfile.csv.gz | head -n 1 > header.csv
cat header.csv sample.csv > tmp.csv && mv -f tmp.csv sample.csv

## You could do the sampling with awk, too, but it will be a bit slower
# awk 'BEGIN {srand()} !/^$/ { if (rand() <= .001 || FNR==1) print $0}'
	## Take an approximately 0.1 percent sample of lines from this gzipped
	## csv file. We do this by having gzip stream to STDOUT and then the
	## Perl one-liner does the sampling. On an 80GB file, the output will
	## be ~80MB. Strictly speaking this is only roughly a 1% sample.
	## Also, there are a few possible edge cases with the rough-and-ready
	## sampling method, but they're not that likely to worry us given what
	## we want to do.
	gzip -cd giantfile.csv.gz \| perl -ne 'print if (rand() < .001)' > sample.csv

	## Don't forget to put back the column names: save the first line of the
	## original file as header.csv, then prepend it to the sample. The
	## combined file is built in tmp.csv and only moved over sample.csv if
	## cat succeeded, so a failure leaves the existing sample untouched.
	gzip -cd giantfile.csv.gz | head -n 1 > header.csv
	cat header.csv sample.csv > tmp.csv && mv -f tmp.csv sample.csv

	## You could do the sampling with awk, too, but it will be a bit slower
	# awk 'BEGIN {srand()} !/^$/ { if (rand() <= .001 || FNR==1) print $0}'