Skip to content

Instantly share code, notes, and snippets.

@bbengfort
Last active June 7, 2017 19:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bbengfort/9ca2821d66e2a0f1316f3986fbcef8e5 to your computer and use it in GitHub Desktop.
Save bbengfort/9ca2821d66e2a0f1316f3986fbcef8e5 to your computer and use it in GitHub Desktop.
compression benchmarks for directories with many small files
#!/usr/bin/env python3
"""
Benchmarks for compression -- data generation and runner
"""
##########################################################################
## Imports
##########################################################################
import os
import csv
import math
import time
import lorem
import tempfile
import argparse
import subprocess
from copy import copy
from shutil import rmtree
from datetime import date
##########################################################################
## Helpers
##########################################################################
# One mebibyte in bytes (2**20). The original value of 1.049e+6 was a
# decimal approximation; the exact value keeps generated file sizes true
# to the "size in MiB" the CLI promises.
MiB = 1048576

# Command-line prefixes for each compression tool; the archive path and
# source/destination arguments are appended by compress()/extract().
GZIP = ["tar", "-zcf"]
GUNZIP = ["tar", "-zxf"]
BZIP2 = ["tar", "-jcf"]
BUNZIP2 = ["tar", "-jxf"]
ZIP = ["zip", "-rqX"]
UNZIP = ["unzip", "-q", "-d"]
def dirstats(path):
    """
    Recursively walk the directory at path, returning a tuple of the
    total size in bytes of all regular files beneath it and the number
    of those files. Directories themselves do not count toward either.
    """
    total_bytes = 0
    total_files = 0
    for entry in os.scandir(path):
        if entry.is_dir():
            # Recurse into the subdirectory and fold in its totals
            sub_bytes, sub_files = dirstats(entry.path)
            total_bytes += sub_bytes
            total_files += sub_files
        elif entry.is_file():
            total_bytes += entry.stat().st_size
            total_files += 1
    return total_bytes, total_files
def gentext(size):
    """
    Build lorem-ipsum text of exactly ``size`` characters by appending
    paragraphs until the accumulated length exceeds the target, then
    truncating to the requested length.
    """
    chunks = []
    length = 0
    while length <= size:
        paragraph = lorem.paragraph() + "\n\n"
        chunks.append(paragraph)
        length += len(paragraph)
    return "".join(chunks)[:size]
def compress(src, dst, algorithm="gzip"):
    """
    Compress the src directory into an archive based at dst, replacing
    any extension on dst with the conventional one for the algorithm
    ("gzip" -> .tgz, "bzip2" -> .tbz2, "zip" -> .zip).

    Returns a (path, seconds) tuple: the archive location and the
    wall-clock duration of the external compression command.

    Raises KeyError for an unknown algorithm and
    subprocess.CalledProcessError if the tool exits nonzero.
    """
    # Use the appropriate extension for the algorithm
    path, _ = os.path.splitext(dst)
    path += {
        'gzip': ".tgz",
        'bzip2': ".tbz2",
        'zip': ".zip",
    }[algorithm]

    # Get the correct args (list + list makes a fresh list, leaving the
    # module-level constants untouched)
    args = {
        'gzip': GZIP,
        'bzip2': BZIP2,
        'zip': ZIP,
    }[algorithm] + [path, src]

    # Run the compression and time it. check=True surfaces a failed
    # tool run instead of silently returning a timing for an archive
    # that was never written; perf_counter is monotonic, so the delta
    # cannot be skewed by wall-clock adjustments mid-run.
    start = time.perf_counter()
    subprocess.run(args, check=True)

    # Return the path and the time delta
    return path, time.perf_counter() - start
def extract(src, dst=None, algorithm="gzip"):
    """
    Extract the archive at src into directory dst (defaulting to the
    directory containing src) using the named algorithm.

    Returns a (dst, seconds) tuple: the extraction directory and the
    wall-clock duration of the external extraction command.

    Raises KeyError for an unknown algorithm and
    subprocess.CalledProcessError if the tool exits nonzero.
    """
    # If dst is None, extract to same dir as src
    dst = dst or os.path.dirname(src)

    # Get the correct args. NOTE: the dict lookup returns the shared
    # module-level list, so we must build a NEW list here -- the
    # original `args += [...]` mutated GUNZIP/BUNZIP2/UNZIP in place,
    # corrupting the command for every subsequent call.
    base = {
        'gzip': GUNZIP,
        'bzip2': BUNZIP2,
        'zip': UNZIP,
    }[algorithm]
    if algorithm == "zip":
        # unzip takes the destination (after -d) before the archive
        args = base + [dst, src]
    else:
        # tar takes the archive, then -C to change into the destination
        args = base + [src, "-C", dst]

    # Run the extraction and time it; check=True surfaces tool failures,
    # and perf_counter is monotonic so the delta cannot go negative.
    start = time.perf_counter()
    subprocess.run(args, check=True)

    # Return the destination and the time delta
    return dst, time.perf_counter() - start
##########################################################################
## Primary Utilities
##########################################################################
def generate(args):
    """
    Wipe and regenerate the benchmark data directory: create
    ceil(files / divide) date-stamped subdirectories, then spread the
    requested number of lorem-ipsum files (each roughly args.size MiB)
    across them round-robin.
    """
    start = time.time()
    path = os.path.normpath(args.dir)

    # Remove any previously generated data so each run starts clean
    if os.path.exists(path):
        print("deleting current directory: {}".format(path))
        rmtree(path)
    os.mkdir(path)

    # Create the required number of date-stamped subdirectories
    ndirs = math.ceil(args.files/args.divide)
    stamp = date.today().strftime("%Y%m%d")
    subdirs = []
    for num in range(1, ndirs+1):
        subdir = os.path.join(path, "{}{:03d}".format(stamp, num))
        os.mkdir(subdir)
        subdirs.append(subdir)

    # Distribute the generated files across the subdirectories
    for num in range(args.files):
        target = os.path.join(
            subdirs[num % ndirs], "ft{:04d}".format(num+1)
        )
        with open(target, 'w') as fobj:
            fobj.write(gentext(int(args.size * MiB)))

    print("wrote {} {}MiB files to {} subdirectories in {} ({:0.3f} seconds)".format(
        args.files, args.size, ndirs, path, time.time() - start
    ))
def benchmark(args):
    """
    Run the compression benchmark described by the parsed command-line
    arguments: compress args.dir into a temp directory, extract the
    archive again, append a CSV result row to args.results, and print a
    one-line summary. With --compress all, fan out to each algorithm.
    """
    # Run all benchmarks sequentially on a copied namespace so the
    # caller's args are not mutated.
    if args.compress == "all":
        cargs = copy(args)
        for algorithm in ("gzip", "bzip2", "zip"):
            cargs.compress = algorithm
            benchmark(cargs)
        return

    # Primary benchmark code; get temp working directory
    tdir = tempfile.mkdtemp("-benchmark", "compress-")
    try:
        # Get the original size and number of files
        osize, nfiles = dirstats(args.dir)

        # First compress the data directory to a tempdir
        arc, ctm = compress(args.dir, os.path.join(tdir, "archive"), args.compress)

        # Compute the compressed archive size
        asize = os.path.getsize(arc)

        # Now extract the compressed archive to the tempdir
        fld, etm = extract(arc, algorithm=args.compress)

        # Save results. newline='' is required by the csv module so row
        # terminators are not doubled on platforms with \r\n translation.
        with open(args.results, 'a', newline='') as f:
            writer = csv.writer(f)
            # algorithm,osize,nfiles,fsize,compress,extract,asize,percent
            writer.writerow([
                args.compress, osize, nfiles, osize/nfiles,
                ctm, etm, asize, (asize / osize) * 100,
            ])
    finally:
        # Always cleanup the temporary directory, even when a tool fails
        rmtree(tdir)

    # Print result
    print((
        "{} compressed {} files ({:0.0f}MiB) in {:0.3f} seconds "
        "({:0.1f}% compression) extracted in {:0.3f} seconds."
    ).format(
        args.compress, nfiles, osize/MiB, ctm, (asize / osize) * 100, etm
    ))
##########################################################################
## Main Method
##########################################################################
if __name__ == '__main__':
    # Create the parser and subparser
    parser = argparse.ArgumentParser(
        description="utility to run benchmarks on compression libraries",
        epilog="for more, see the blog post on bbengfort.github.io"
    )
    subparsers = parser.add_subparsers(
        title="commands",
        description="benchmarking utilities"
    )

    # Add the commands and arguments
    gp = subparsers.add_parser('generate', description="generate data for benchmark")
    gp.add_argument('-d', '--divide', metavar="N", type=int, default=10, help="number of files per subdirectory")
    gp.add_argument('-n', '--files', metavar="N", type=int, default=50, help="number of files to generate")
    gp.add_argument('-s', '--size', metavar="MiB", type=int, default=1, help="approximate size in MiB of each file")
    gp.add_argument('dir', nargs="?", type=str, default='data', help="directory to wipe and fill with benchmark data")
    gp.set_defaults(func=generate)

    bp = subparsers.add_parser('benchmark', description="run the benchmark for compression")
    bp.add_argument('-r', '--results', metavar="CSV", type=str, default="results.csv", help="location to append results to")
    bp.add_argument('-c', '--compress', default="gzip", metavar="ZIP", choices={"gzip", "bzip2", "zip", "all"}, help="the compression algorithm to benchmark")
    bp.add_argument('dir', nargs="?", type=str, default='data', help="directory to run the compression benchmark on")
    bp.set_defaults(func=benchmark)

    # Run the parser
    args = parser.parse_args()

    # When no subcommand is given, argparse leaves no `func` attribute
    # on the namespace; show usage instead of crashing with
    # AttributeError.
    if not hasattr(args, 'func'):
        parser.print_help()
        parser.exit(1)
    args.func(args)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment