Last active
June 7, 2017 19:50
-
-
Save bbengfort/9ca2821d66e2a0f1316f3986fbcef8e5 to your computer and use it in GitHub Desktop.
compression benchmarks for directories with many small files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" | |
Benchmarks for compression -- data generation and runner | |
""" | |
########################################################################## | |
## Imports | |
########################################################################## | |
import os | |
import csv | |
import math | |
import time | |
import lorem | |
import tempfile | |
import argparse | |
import subprocess | |
from copy import copy | |
from shutil import rmtree | |
from datetime import date | |
########################################################################## | |
## Helpers | |
########################################################################## | |
# One mebibyte in bytes, exactly (2**20). The previous approximation
# 1.049e+6 was off by ~424 bytes per MiB, skewing size calculations.
MiB = 2 ** 20
# Command prefixes for each compression/extraction tool; the archive
# path and source/destination are appended at call time.
GZIP = ["tar", "-zcf"]
GUNZIP = ["tar", "-zxf"]
BZIP2 = ["tar", "-jcf"]
BUNZIP2 = ["tar", "-jxf"]
ZIP = ["zip", "-rqX"]
UNZIP = ["unzip", "-q", "-d"]
def dirstats(path):
    """
    Return a (total_bytes, file_count) pair for all regular files found
    under *path*, descending into subdirectories.
    """
    total_bytes, file_count = 0, 0
    # Walk the tree iteratively with an explicit stack of directories.
    pending = [path]
    while pending:
        directory = pending.pop()
        for entry in os.scandir(directory):
            if entry.is_file():
                file_count += 1
                total_bytes += entry.stat().st_size
            elif entry.is_dir():
                pending.append(entry.path)
    return total_bytes, file_count
def gentext(size):
    """
    Generate lorem-ipsum text of exactly *size* characters by appending
    paragraphs until the target length is exceeded, then truncating.
    """
    # Accumulate paragraphs in a list and join once at the end; repeated
    # string += in a loop is quadratic in the total length.
    parts = []
    length = 0
    while length <= size:
        paragraph = lorem.paragraph() + "\n\n"
        parts.append(paragraph)
        length += len(paragraph)
    return "".join(parts)[:size]
def compress(src, dst, algorithm="gzip"):
    """
    Compress the directory *src* into an archive based at *dst* using the
    given algorithm ("gzip", "bzip2", or "zip").

    Returns a (path, seconds) tuple: the archive path (with the extension
    appropriate to the algorithm) and the wall-clock compression time.

    Raises subprocess.CalledProcessError if the external tool fails, and
    KeyError for an unknown algorithm.
    """
    # Use the appropriate extension for the algorithm
    path, _ = os.path.splitext(dst)
    path += {
        'gzip': ".tgz",
        'bzip2': ".tbz2",
        'zip': ".zip",
    }[algorithm]
    # Get the correct args
    args = {
        'gzip': GZIP,
        'bzip2': BZIP2,
        'zip': ZIP,
    }[algorithm] + [path, src]
    # Run the compression and time it. check=True surfaces tool failures
    # immediately instead of silently timing a no-op and failing later.
    start = time.time()
    subprocess.run(args, check=True)
    # Return the path and the time delta
    return path, time.time() - start
def extract(src, dst=None, algorithm="gzip"):
    """
    Extract the archive *src* into directory *dst* using the given
    algorithm ("gzip", "bzip2", or "zip").

    Returns a (dst, seconds) tuple: the extraction directory and the
    wall-clock extraction time.

    Raises subprocess.CalledProcessError if the external tool fails, and
    KeyError for an unknown algorithm.
    """
    # If dst is None, extract to same dir as src
    dst = dst or os.path.dirname(src)
    # Get the correct args
    args = {
        'gzip': GUNZIP,
        'bzip2': BUNZIP2,
        'zip': UNZIP,
    }[algorithm]
    # unzip takes the destination before the archive; tar takes -C after
    if algorithm == "zip":
        args += [dst, src]
    else:
        args += [src, "-C", dst]
    # Run the extraction and time it. check=True surfaces tool failures
    # immediately instead of silently timing a no-op.
    start = time.time()
    subprocess.run(args, check=True)
    # Return the path and the time delta
    return dst, time.time() - start
########################################################################## | |
## Primary Utilities | |
########################################################################## | |
def generate(args):
    """
    Build the benchmark dataset: args.files files of roughly args.size MiB
    each, distributed round-robin across date-stamped subdirectories of
    args.dir (args.divide files per subdirectory). Any existing data
    directory is deleted first.
    """
    started = time.time()
    root = os.path.normpath(args.dir)
    # Wipe any previous benchmark data so every run starts clean
    if os.path.exists(root):
        print("deleting current directory: {}".format(root))
        rmtree(root)
    os.mkdir(root)
    # One subdirectory per `divide` files, named YYYYMMDDNNN
    ndirs = math.ceil(args.files / args.divide)
    stamp = date.today().strftime("%Y%m%d")
    subdirs = []
    for num in range(1, ndirs + 1):
        subdir = os.path.join(root, "{}{:03d}".format(stamp, num))
        os.mkdir(subdir)
        subdirs.append(subdir)
    # Distribute the files round-robin across the subdirectories
    nbytes = int(args.size * MiB)
    for num in range(args.files):
        target = os.path.join(subdirs[num % ndirs], "ft{:04d}".format(num + 1))
        with open(target, 'w') as fobj:
            fobj.write(gentext(nbytes))
    print("wrote {} {}MiB files to {} subdirectories in {} ({:0.3f} seconds)".format(
        args.files, args.size, ndirs, root, time.time() - started
    ))
def benchmark(args):
    """
    Benchmark one compression algorithm (or all of them) against the data
    directory args.dir: compress to a temp dir, measure the archive size,
    extract it back, append a CSV row to args.results, and print a summary.
    """
    # "all" fans out to each concrete algorithm in turn
    if args.compress == "all":
        cargs = copy(args)
        for algorithm in ("gzip", "bzip2", "zip"):
            cargs.compress = algorithm
            benchmark(cargs)
        return
    # Primary benchmark code; get temp working directory
    tdir = tempfile.mkdtemp("-benchmark", "compress-")
    try:
        # Get the original size and number of files
        osize, nfiles = dirstats(args.dir)
        # First compress the data directory to a tempdir
        arc, ctm = compress(args.dir, os.path.join(tdir, "archive"), args.compress)
        # Compute the compression size
        asize = os.path.getsize(arc)
        # Now extract the compressed archive to the tempdir
        fld, etm = extract(arc, algorithm=args.compress)
        # Save results; newline='' prevents the csv module writing blank
        # rows on platforms with \r\n line endings
        with open(args.results, 'a', newline='') as f:
            writer = csv.writer(f)
            # algorithm,osize,nfiles,fsize,compress,extract,asize,percent
            writer.writerow([
                args.compress, osize, nfiles, osize/nfiles,
                ctm, etm, asize, (asize / osize) * 100,
            ])
    finally:
        # Cleanup the temporary directory even if a step above fails,
        # so aborted runs don't leak archives into the temp filesystem
        rmtree(tdir)
    # Print result
    print((
        "{} compressed {} files ({:0.0f}MiB) in {:0.3f} seconds "
        "({:0.1f}% compression) extracted in {:0.3f} seconds."
    ).format(
        args.compress, nfiles, osize/MiB, ctm, (asize / osize) * 100, etm
    ))
########################################################################## | |
## Main Method | |
########################################################################## | |
if __name__ == '__main__':
    # Create the parser and subparser
    parser = argparse.ArgumentParser(
        description="utility to run benchmarks on compression libraries",
        epilog="for more, see the blog post on bbengfort.github.io"
    )
    subparsers = parser.add_subparsers(
        title="commands",
        description="benchmarking utilities"
    )
    # generate: build the benchmark data directory tree
    gp = subparsers.add_parser('generate', description="generate data for benchmark")
    gp.add_argument('-d', '--divide', metavar="N", type=int, default=10, help="number of files per subdirectory")
    gp.add_argument('-n', '--files', metavar="N", type=int, default=50, help="number of files to generate")
    gp.add_argument('-s', '--size', metavar="MiB", type=int, default=1, help="approximate size in MiB of each file")
    gp.add_argument('dir', nargs="?", type=str, default='data', help="directory to wipe and fill with benchmark data")
    gp.set_defaults(func=generate)
    # benchmark: time compression/extraction and record results
    bp = subparsers.add_parser('benchmark', description="run the benchmark for compression")
    bp.add_argument('-r', '--results', metavar="CSV", type=str, default="results.csv", help="location to append results to")
    bp.add_argument('-c', '--compress', default="gzip", metavar="ZIP", choices={"gzip", "bzip2", "zip", "all"}, help="the compression algorithm to benchmark")
    bp.add_argument('dir', nargs="?", type=str, default='data', help="directory to run the compression benchmark on")
    bp.set_defaults(func=benchmark)
    # Run the parser. In Python 3, subparsers are optional by default, so
    # invoking the script with no subcommand would leave args.func unset
    # and crash with AttributeError; show help instead.
    args = parser.parse_args()
    if hasattr(args, "func"):
        args.func(args)
    else:
        parser.print_help()
        parser.exit(2)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment