Skip to content

Instantly share code, notes, and snippets.

@catlee
Created May 31, 2019 12:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save catlee/80e4abae57d98180b755838ddbda6e04 to your computer and use it in GitHub Desktop.
Save catlee/80e4abae57d98180b755838ddbda6e04 to your computer and use it in GitHub Desktop.
Analyze compression formats
#!/usr/bin/env python3
from pathlib import Path
import logging
from contextlib import contextmanager
import shutil
import tempfile
import subprocess
import time
import requests
# Module-level logger used throughout this script.
log = logging.getLogger(__name__)
class Timer:
    """Reusable context manager that records elapsed time per ``with`` block.

    Every time the instance is used in a ``with`` statement, the number of
    seconds spent inside the block is appended to ``results``, so a single
    instance can be re-entered to collect several samples.

    Attributes:
        results: list of elapsed durations in seconds, one per completed block.
        start: clock reading taken when the current block was entered.
    """

    def __init__(self):
        self.results = []
        self.start = 0

    def __enter__(self):
        # perf_counter() is monotonic and high resolution; time.time() can
        # jump when the system clock is adjusted and skew an interval.
        self.start = time.perf_counter()
        # Return the timer so ``with Timer() as t:`` binds a usable object.
        return self

    def __exit__(self, *args, **kwargs):
        # The sample is recorded even when the block raised; returning None
        # lets that exception propagate to the caller.
        self.results.append(time.perf_counter() - self.start)
@contextmanager
def tmpdir():
    """Yield a fresh temporary directory as a Path and delete it on exit.

    The directory and everything inside it are removed when the ``with``
    block ends, whether it finishes normally or raises.

    Yields:
        pathlib.Path: the newly created temporary directory.
    """
    # TemporaryDirectory is used instead of mkdtemp + shutil.rmtree because its
    # cleanup (on Python 3.8+) resets permissions on entries it cannot delete.
    # Unpacked archives can contain read-only directories that would make a
    # plain rmtree fail with PermissionError.
    with tempfile.TemporaryDirectory() as name:
        yield Path(name)
def download_file(url, dest):
    """Download url and store the response body at the path dest.

    The body is streamed to a sibling ``<name>.part`` file and only renamed to
    dest once the transfer has completed, so an interrupted or failed download
    never leaves something at dest that looks like a finished file.

    Args:
        url: address to fetch with an HTTP GET.
        dest: pathlib.Path the downloaded bytes are written to.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.RequestException: on connection problems or timeouts.
    """
    partial = dest.with_name(dest.name + ".part")
    try:
        # stream=True avoids holding the whole body in memory. The timeout
        # bounds each connect/read wait, not the whole transfer.
        with requests.get(url, stream=True, timeout=60) as resp:
            # Fail loudly instead of saving an error page as the archive.
            resp.raise_for_status()
            with partial.open("wb") as out:
                for chunk in resp.iter_content(chunk_size=1024 * 1024):
                    out.write(chunk)
        partial.replace(dest)
    finally:
        # After a successful rename the .part file no longer exists; on any
        # failure this removes the incomplete download.
        if partial.exists():
            partial.unlink()
def unpack(f, d):
    """Unpack archive f into directory d.

    The extraction tool is picked from the file name:

    * ``.zip`` -> ``unzip``
    * ``.tar``, ``.tgz`` or ``.tar.<anything>`` -> ``tar``
    * ``.dmg``, ``.exe`` -> ``7z``

    Args:
        f: pathlib.Path of the archive.
        d: pathlib.Path of the target directory; created if it is missing
            (its parent must already exist).

    Raises:
        ValueError: if the file name matches none of the supported types.
        subprocess.CalledProcessError: if the extraction tool exits non-zero.
    """
    log.debug("Unpacking %s into %s", f, d)
    d.mkdir(exist_ok=True)
    archive = f.resolve()
    # Slicing instead of indexing [-2]: names with fewer than two suffixes
    # (e.g. "foo.dmg" or "foo") give an empty slice rather than IndexError,
    # so the later branches and the ValueError stay reachable.
    is_tarball = f.suffix in (".tar", ".tgz") or f.suffixes[-2:-1] == [".tar"]
    if f.suffix == ".zip":
        subprocess.run(["unzip", "-q", archive], cwd=d, check=True)
    elif is_tarball:
        subprocess.run(["tar", "xf", archive], cwd=d, check=True)
    elif f.suffix in (".dmg", ".exe"):
        # Silence 7z: stdout goes to /dev/null and stderr is merged into it.
        subprocess.run(
            ["7z", "x", archive],
            cwd=d,
            check=True,
            stderr=subprocess.STDOUT,
            stdout=subprocess.DEVNULL,
        )
    else:
        raise ValueError("Unsupported archive type: %s" % f.suffixes)
def tar_up(d, f, compress_command):
    """Tar up directory d into the file f, piping through compress_command.

    Runs the equivalent of ``tar c <d.name> | compress_command > f`` from the
    parent of d, so the archive holds paths relative to that parent.

    Args:
        d: pathlib.Path of the directory to archive.
        f: pathlib.Path of the output file (created or truncated).
        compress_command: argv list of a filter that reads the tar stream on
            stdin and produces its compressed output on stdout.

    Raises:
        OSError: "Compression failed" if compress_command exits non-zero,
            otherwise "Tarring failed" if tar exits non-zero.
    """
    # Both context managers guarantee cleanup: the output file is closed and
    # tar is reaped even if starting the compressor raises.
    with f.open("wb") as out, subprocess.Popen(
        ["tar", "c", d.name], cwd=d.parent, stdout=subprocess.PIPE
    ) as tar_proc:
        try:
            compress_proc = subprocess.Popen(
                compress_command, stdin=tar_proc.stdout, stdout=out
            )
        finally:
            # Drop our copy of the pipe's read end. Otherwise tar never sees
            # SIGPIPE when the compressor exits early and blocks forever once
            # the pipe buffer fills.
            tar_proc.stdout.close()
        compress_proc.wait()
        tar_proc.wait()
    if compress_proc.returncode != 0:
        raise OSError("Compression failed")
    if tar_proc.returncode != 0:
        raise OSError("Tarring failed")
# Default (label, compressor argv, archive suffix) triples benchmarked by
# analyze_formats. Each command must write its compressed output to stdout.
_DEFAULT_METHODS = (
    ("xz (default)", ["xz", "-T0", "-c"], ".tar.xz"),
    ("xz (max)", ["xz", "-T0", "-9e", "-c"], ".tar.xz"),
    ("zstd (default)", ["zstd", "-T0", "-c"], ".tar.zst"),
    ("zstd (max)", ["zstd", "-T0", "-19", "-c"], ".tar.zst"),
    ("gzip (default)", ["gzip", "-c"], ".tar.gz"),
    ("gzip (max)", ["gzip", "-9", "-c"], ".tar.gz"),
    ("bzip2 (default)", ["bzip2", "-c"], ".tar.bz2"),
)


def _time_compression(unpack_dir, dest, compress_command, attempts):
    """Compress unpack_dir into dest attempts times; return (times, sizes)."""
    timer = Timer()
    sizes = []
    for i in range(attempts):
        # Start every attempt from a clean slate so runs are comparable.
        if dest.exists():
            dest.unlink()
        with timer:
            log.debug("compress attempt %d/%d", i + 1, attempts)
            tar_up(unpack_dir, dest, compress_command)
        sizes.append(dest.stat().st_size)
    return timer.results, sizes


def _time_decompression(archive, attempts):
    """Unpack archive attempts times into scratch dirs; return the timings."""
    timer = Timer()
    for i in range(attempts):
        # Only the unpack itself is timed; creating and deleting the scratch
        # directory happens outside the timer.
        with tmpdir() as scratch:
            with timer:
                log.debug("decompress attempt %d/%d", i + 1, attempts)
                unpack(archive, scratch)
    return timer.results


def analyze_formats(f, attempts=3, methods=None):
    """Benchmark several compressors on the contents of archive f.

    The archive is unpacked once into a temporary directory. For each method
    the unpacked tree is then re-compressed and decompressed ``attempts``
    times, keeping the best time and the smallest output size.

    Args:
        f: pathlib.Path of the source archive (any type unpack() supports).
        attempts: number of timed runs per method and direction.
        methods: iterable of (label, compressor argv, archive suffix) triples;
            defaults to the built-in xz/zstd/gzip/bzip2 table.

    Returns:
        dict mapping each method label to a dict with the keys
        "compress_time", "size" and "decompress_time".

    Raises:
        ValueError: if attempts is smaller than 1.
    """
    if attempts < 1:
        raise ValueError("attempts must be at least 1")
    if methods is None:
        methods = _DEFAULT_METHODS

    results = {}
    # First unpack it
    with tmpdir() as d:
        unpack_dir = d / "unpacked"
        log.debug("unpacking %s into %s", f, unpack_dir)
        unpack(f, unpack_dir)
        for name, compress_command, suffix in methods:
            dest = d / f"compressed{suffix}"
            log.info("testing method %s", name)
            log.debug(
                "%s: compressing %s into %s using %s",
                name,
                unpack_dir,
                dest,
                compress_command,
            )

            compress_times, sizes = _time_compression(
                unpack_dir, dest, compress_command, attempts
            )
            log.debug("compression times: %s", compress_times)
            log.info("best compression time: %.2f", min(compress_times))
            log.debug("sizes: %s", sizes)
            if len(set(sizes)) != 1:
                log.warning("output size is non-deterministic!")
            results[name] = {
                "compress_time": min(compress_times),
                "size": min(sizes),
            }

            # dest still holds the archive from the last compression attempt.
            decompress_times = _time_decompression(dest, attempts)
            log.debug("decompression times: %s", decompress_times)
            log.info("best decompression time: %.2f", min(decompress_times))
            results[name]["decompress_time"] = min(decompress_times)
    return results
def calc_fitness(compress_time, decompress_time, size):
    """Return a single cost score (in seconds) for a compression result.

    The score is the compression time plus 20 times the sum of the
    decompression time and the time needed to transfer ``size`` bytes at
    50 MB/s.

    Args:
        compress_time: seconds spent compressing.
        decompress_time: seconds spent decompressing.
        size: compressed size in bytes.
    """
    bytes_per_second = 50_000_000  # 50 MB/s
    download_seconds = size / bytes_per_second
    # NOTE(review): the factor 20 presumably models one compression feeding
    # many downloads and decompressions; confirm the intended weighting.
    consumer_cost = decompress_time + download_seconds
    return compress_time + 20 * consumer_cost
if __name__ == "__main__":
    # Timestamped INFO-level logging so progress of each step is visible.
    logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.INFO)

    # Downloaded packages are kept in a "data" directory next to this script
    # and reused on later runs.
    DATA_DIR = (Path(__file__).parent / "data").resolve()
    DATA_DIR.mkdir(exist_ok=True)

    # Sample packages to benchmark.
    PACKAGES = [
        "https://archive.mozilla.org/pub/firefox/nightly/2019/05/2019-05-29-21-52-51-mozilla-central/mozharness.zip",
        "https://archive.mozilla.org/pub/firefox/nightly/2019/05/2019-05-29-21-52-51-mozilla-central/firefox-69.0a1.en-US.win64.common.tests.tar.gz",
        "https://archive.mozilla.org/pub/firefox/nightly/2019/05/2019-05-29-21-52-51-mozilla-central/firefox-69.0a1.en-US.win64.installer.exe",
        "https://archive.mozilla.org/pub/firefox/nightly/2019/05/2019-05-29-21-52-51-mozilla-central/firefox-69.0a1.en-US.linux-x86_64.web-platform.tests.tar.gz",
        "https://archive.mozilla.org/pub/firefox/nightly/2019/05/2019-05-29-21-52-51-mozilla-central/firefox-69.0a1.en-US.mac.crashreporter-symbols.zip",
        "https://archive.mozilla.org/pub/firefox/nightly/2019/05/2019-05-29-21-52-51-mozilla-central/firefox-69.0a1.en-US.mac.dmg",
        "https://archive.mozilla.org/pub/firefox/nightly/2019/05/2019-05-29-21-52-51-mozilla-central/firefox-69.0a1.en-US.win64.zip",
        "https://archive.mozilla.org/pub/firefox/nightly/2019/05/2019-05-29-21-52-51-mozilla-central/firefox-69.0a1.en-US.linux-x86_64.tar.bz2",
    ]

    for url in PACKAGES:
        # The local file name is the last path component of the URL.
        dest = DATA_DIR / url.split("/")[-1]
        if not dest.exists():
            log.info("Downloading %s to %s", url, dest)
            download_file(url, dest)

        log.info("Analyzing %s", dest.name)
        results = analyze_formats(dest)

        # Score every method, then list them from highest score to lowest.
        fitness = []
        for method, result in results.items():
            fitness.append((method, calc_fitness(**result)))
        fitness.sort(key=lambda pair: pair[1], reverse=True)

        log.info("FITNESS for %s:", dest.name)
        for method, score in fitness:
            log.info("%s: %.2f", method, score)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment