Last active
June 7, 2017 19:50
-
-
Save bbengfort/9ca2821d66e2a0f1316f3986fbcef8e5 to your computer and use it in GitHub Desktop.
compression benchmarks for directories with many small files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" | |
Benchmarks for compression -- data generation and runner | |
""" | |
########################################################################## | |
## Imports | |
########################################################################## | |
import os | |
import csv | |
import math | |
import time | |
import lorem | |
import tempfile | |
import argparse | |
import subprocess | |
from copy import copy | |
from shutil import rmtree | |
from datetime import date | |
########################################################################## | |
## Helpers | |
########################################################################## | |
# One mebibyte in bytes, exactly (2**20). The previous approximation
# 1.049e+6 was off by ~424 bytes per MiB, skewing size calculations.
MiB = 2 ** 20
# Command prefixes for each compression/extraction tool; the archive
# path and source/destination are appended at call time.
GZIP = ["tar", "-zcf"]
GUNZIP = ["tar", "-zxf"]
BZIP2 = ["tar", "-jcf"]
BUNZIP2 = ["tar", "-jxf"]
ZIP = ["zip", "-rqX"]
UNZIP = ["unzip", "-q", "-d"]
def dirstats(path):
    """
    Return a (total_bytes, file_count) pair for all regular files found
    under *path*, descending into subdirectories.
    """
    total_bytes, file_count = 0, 0
    # Walk the tree iteratively with an explicit stack of directories.
    pending = [path]
    while pending:
        directory = pending.pop()
        for entry in os.scandir(directory):
            if entry.is_file():
                file_count += 1
                total_bytes += entry.stat().st_size
            elif entry.is_dir():
                pending.append(entry.path)
    return total_bytes, file_count
def gentext(size):
    """
    Generate lorem-ipsum text of exactly *size* characters by appending
    paragraphs until the target length is exceeded, then truncating.
    """
    # Accumulate paragraphs in a list and join once at the end; repeated
    # string += in a loop is quadratic in the total length.
    parts = []
    length = 0
    while length <= size:
        paragraph = lorem.paragraph() + "\n\n"
        parts.append(paragraph)
        length += len(paragraph)
    return "".join(parts)[:size]
def compress(src, dst, algorithm="gzip"):
    """
    Compress the directory *src* into an archive based at *dst* using the
    given algorithm ("gzip", "bzip2", or "zip").

    Returns a (path, seconds) tuple: the archive path (with the extension
    appropriate to the algorithm) and the wall-clock compression time.

    Raises subprocess.CalledProcessError if the external tool fails, and
    KeyError for an unknown algorithm.
    """
    # Use the appropriate extension for the algorithm
    path, _ = os.path.splitext(dst)
    path += {
        'gzip': ".tgz",
        'bzip2': ".tbz2",
        'zip': ".zip",
    }[algorithm]
    # Get the correct args
    args = {
        'gzip': GZIP,
        'bzip2': BZIP2,
        'zip': ZIP,
    }[algorithm] + [path, src]
    # Run the compression and time it. check=True surfaces tool failures
    # immediately instead of silently timing a no-op and failing later.
    start = time.time()
    subprocess.run(args, check=True)
    # Return the path and the time delta
    return path, time.time() - start
def extract(src, dst=None, algorithm="gzip"):
    """
    Extract the archive *src* into directory *dst* using the given
    algorithm ("gzip", "bzip2", or "zip").

    Returns a (dst, seconds) tuple: the extraction directory and the
    wall-clock extraction time.

    Raises subprocess.CalledProcessError if the external tool fails, and
    KeyError for an unknown algorithm.
    """
    # If dst is None, extract to same dir as src
    dst = dst or os.path.dirname(src)
    # Get the correct args
    args = {
        'gzip': GUNZIP,
        'bzip2': BUNZIP2,
        'zip': UNZIP,
    }[algorithm]
    # unzip takes the destination before the archive; tar takes -C after
    if algorithm == "zip":
        args += [dst, src]
    else:
        args += [src, "-C", dst]
    # Run the extraction and time it. check=True surfaces tool failures
    # immediately instead of silently timing a no-op.
    start = time.time()
    subprocess.run(args, check=True)
    # Return the path and the time delta
    return dst, time.time() - start
########################################################################## | |
## Primary Utilities | |
########################################################################## | |
def generate(args):
    """
    Build the benchmark dataset: args.files files of roughly args.size MiB
    each, distributed round-robin across date-stamped subdirectories of
    args.dir (args.divide files per subdirectory). Any existing data
    directory is deleted first.
    """
    started = time.time()
    root = os.path.normpath(args.dir)
    # Wipe any previous benchmark data so every run starts clean
    if os.path.exists(root):
        print("deleting current directory: {}".format(root))
        rmtree(root)
    os.mkdir(root)
    # One subdirectory per `divide` files, named YYYYMMDDNNN
    ndirs = math.ceil(args.files / args.divide)
    stamp = date.today().strftime("%Y%m%d")
    subdirs = []
    for num in range(1, ndirs + 1):
        subdir = os.path.join(root, "{}{:03d}".format(stamp, num))
        os.mkdir(subdir)
        subdirs.append(subdir)
    # Distribute the files round-robin across the subdirectories
    nbytes = int(args.size * MiB)
    for num in range(args.files):
        target = os.path.join(subdirs[num % ndirs], "ft{:04d}".format(num + 1))
        with open(target, 'w') as fobj:
            fobj.write(gentext(nbytes))
    print("wrote {} {}MiB files to {} subdirectories in {} ({:0.3f} seconds)".format(
        args.files, args.size, ndirs, root, time.time() - started
    ))
def benchmark(args):
    """
    Benchmark one compression algorithm (or all of them) against the data
    directory args.dir: compress to a temp dir, measure the archive size,
    extract it back, append a CSV row to args.results, and print a summary.
    """
    # "all" fans out to each concrete algorithm in turn
    if args.compress == "all":
        cargs = copy(args)
        for algorithm in ("gzip", "bzip2", "zip"):
            cargs.compress = algorithm
            benchmark(cargs)
        return
    # Primary benchmark code; get temp working directory
    tdir = tempfile.mkdtemp("-benchmark", "compress-")
    try:
        # Get the original size and number of files
        osize, nfiles = dirstats(args.dir)
        # First compress the data directory to a tempdir
        arc, ctm = compress(args.dir, os.path.join(tdir, "archive"), args.compress)
        # Compute the compression size
        asize = os.path.getsize(arc)
        # Now extract the compressed archive to the tempdir
        fld, etm = extract(arc, algorithm=args.compress)
        # Save results; newline='' prevents the csv module writing blank
        # rows on platforms with \r\n line endings
        with open(args.results, 'a', newline='') as f:
            writer = csv.writer(f)
            # algorithm,osize,nfiles,fsize,compress,extract,asize,percent
            writer.writerow([
                args.compress, osize, nfiles, osize/nfiles,
                ctm, etm, asize, (asize / osize) * 100,
            ])
    finally:
        # Cleanup the temporary directory even if a step above fails,
        # so aborted runs don't leak archives into the temp filesystem
        rmtree(tdir)
    # Print result
    print((
        "{} compressed {} files ({:0.0f}MiB) in {:0.3f} seconds "
        "({:0.1f}% compression) extracted in {:0.3f} seconds."
    ).format(
        args.compress, nfiles, osize/MiB, ctm, (asize / osize) * 100, etm
    ))
########################################################################## | |
## Main Method | |
########################################################################## | |
if __name__ == '__main__':
    # Create the parser and subparser
    parser = argparse.ArgumentParser(
        description="utility to run benchmarks on compression libraries",
        epilog="for more, see the blog post on bbengfort.github.io"
    )
    subparsers = parser.add_subparsers(
        title="commands",
        description="benchmarking utilities"
    )
    # generate: build the benchmark data directory tree
    gp = subparsers.add_parser('generate', description="generate data for benchmark")
    gp.add_argument('-d', '--divide', metavar="N", type=int, default=10, help="number of files per subdirectory")
    gp.add_argument('-n', '--files', metavar="N", type=int, default=50, help="number of files to generate")
    gp.add_argument('-s', '--size', metavar="MiB", type=int, default=1, help="approximate size in MiB of each file")
    gp.add_argument('dir', nargs="?", type=str, default='data', help="directory to wipe and fill with benchmark data")
    gp.set_defaults(func=generate)
    # benchmark: time compression/extraction and record results
    bp = subparsers.add_parser('benchmark', description="run the benchmark for compression")
    bp.add_argument('-r', '--results', metavar="CSV", type=str, default="results.csv", help="location to append results to")
    bp.add_argument('-c', '--compress', default="gzip", metavar="ZIP", choices={"gzip", "bzip2", "zip", "all"}, help="the compression algorithm to benchmark")
    bp.add_argument('dir', nargs="?", type=str, default='data', help="directory to run the compression benchmark on")
    bp.set_defaults(func=benchmark)
    # Run the parser. In Python 3, subparsers are optional by default, so
    # invoking the script with no subcommand would leave args.func unset
    # and crash with AttributeError; show help instead.
    args = parser.parse_args()
    if hasattr(args, "func"):
        args.func(args)
    else:
        parser.print_help()
        parser.exit(2)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment