Skip to content

Instantly share code, notes, and snippets.

@willkg
Created October 9, 2020 00:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save willkg/02d82dcbc5cf9b00cab74992e2586819 to your computer and use it in GitHub Desktop.
Save willkg/02d82dcbc5cf9b00cab74992e2586819 to your computer and use it in GitHub Desktop.
Script to compress files with makecab and upload
#!/usr/bin/env python
"""
October 8th, 2020. I wrote this to fix a problem where .dll and .exe files were getting
uploaded to symbols.mozilla.org, but hadn't been run through makecab. That prevented
them from getting served by symbols.mozilla.org and that broke debugging efforts.
I ran this script in WSL2 on Windows. It downloads files from the symbols bucket, runs
makecab on them, batches them up, and periodically uploads a symbols.zip file.
Since there are so many files involved and my Windows machine is flaky and the network
is flaky and life is flaky, it tries to be resilient and keeps track of progress so it
can pick up where it left off.
"""
import datetime
import functools
import os
import pathlib
import shutil
import subprocess
import sys
import time
import zipfile

import requests
import urllib3
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
# Public S3 bucket that backs symbols.mozilla.org
BUCKET_URL = "https://s3-us-west-2.amazonaws.com/org.mozilla.crash-stats.symbols-public/v1/"
DOWNLOAD_API = "https://symbols.mozilla.org/"
UPLOAD_API = "https://symbols.mozilla.org/upload/"
# FIXME(willkg): use your auth token
UPLOAD_AUTH_TOKEN = ""
# Windows cabinet compression tool; this script runs it from WSL2
MAKECAB = "makecab.exe"
DATADIR = "data/"
SYMDIR = "symbols/"
# 150 MB -- NOTE(review): the original comment said "300mb", but the value is
# 150 * 1024 * 1024; accumulate at least this much before building a zip
SYMBOLS_ZIP_GOOD_SIZE = 150 * 1024 * 1024
class HTTPAdapterWithTimeout(HTTPAdapter):
    """HTTPAdapter that applies a default timeout to every request.

    Accepts a ``default_timeout`` keyword argument (seconds; defaults to 5.0).
    Any request sent without an explicit timeout uses that value.
    """

    def __init__(self, *args, **kwargs):
        self._default_timeout = kwargs.pop("default_timeout", 5.0)
        super().__init__(*args, **kwargs)

    def send(self, *args, **kwargs):
        # Keep an explicitly-passed timeout; otherwise fall back to the default.
        if not kwargs.get("timeout"):
            kwargs["timeout"] = self._default_timeout
        return super().send(*args, **kwargs)
def retry_session():
    """Build a requests Session with retries and a default per-request timeout."""
    retry_config = Retry(
        total=5,
        read=5,
        connect=5,
        backoff_factor=0.2,
        # NOTE(willkg): This is in addition to 429
        status_forcelist=(500, 502, 503, 504),
    )
    adapter = HTTPAdapterWithTimeout(max_retries=retry_config, default_timeout=5.0)

    session = requests.Session()
    # Set user agent
    session.headers.update({"User-Agent": "willkg/comp_and_upload"})
    for scheme in ("http://", "https://"):
        session.mount(scheme, adapter)
    return session
def makedir(dn):
    """Create directory ``dn`` (including parents) if it doesn't exist.

    Uses ``exist_ok=True`` instead of the original check-then-create, which
    had a race if the directory appeared between the check and the makedirs.

    :param dn: directory path to create
    """
    os.makedirs(dn, exist_ok=True)
def try_harder(fun):
    """Decorator: retry ``fun`` forever on transient network errors.

    Retries on urllib3 ProtocolError and requests ConnectionError, sleeping
    one second between attempts. Any other exception propagates normally.

    Adds ``functools.wraps`` so the wrapped function keeps its name/docstring
    (the original decorator discarded that metadata).
    """
    @functools.wraps(fun)
    def _try_harder(*args, **kwargs):
        while True:
            try:
                return fun(*args, **kwargs)
            except (urllib3.exceptions.ProtocolError, requests.exceptions.ConnectionError) as pe:
                print(f"try_harder error {pe!r}")
                time.sleep(1)
    return _try_harder
@try_harder
def head_file(session, fn):
    """Return True if the makecab-compressed version of fn is already in the bucket.

    The compressed object uses the cab naming convention: the last character
    of the filename is replaced with "_" (e.g. foo.dl_ for foo.dll).
    """
    url = BUCKET_URL + fn[:-1] + "_"
    # HEAD is enough--we only care whether the object exists
    resp = session.head(url)
    # print(url, resp.status_code)
    return resp.status_code == 200
@try_harder
def download_file(session, fn, dest_fn):
    """Download ``fn`` from the bucket to ``dest_fn``.

    Skips the download when ``dest_fn`` already exists (resumed runs).
    Returns True when the file is on disk afterwards, False when the GET
    failed.
    """
    if os.path.exists(dest_fn):
        return True

    url = BUCKET_URL + fn
    resp = session.get(url)
    if resp.status_code != 200:
        print(">>> ERROR: GET %s %s" % (url, resp.status_code))
        return False

    makedir(os.path.dirname(dest_fn))
    print(">>> Downloaded file: %s" % url)
    with open(dest_fn, "wb") as fp:
        body = resp.content
        print(f">>> Wrote {len(body):,} bytes")
        fp.write(body)
    return True
def compress_file(cache_fn, symbols_tmp_fn):
    """Run makecab on ``cache_fn``, writing the compressed result to ``symbols_tmp_fn``."""
    makedir(os.path.dirname(symbols_tmp_fn))
    cmd = [MAKECAB, "-D", "CompressionType=MSZIP", cache_fn, symbols_tmp_fn]
    # makecab is chatty--discard its output; a non-zero exit still raises
    subprocess.check_call(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
    print(f">>> Compressed: {get_size(cache_fn):,} -> {get_size(symbols_tmp_fn):,}")
def build_zip_file(zip_filename, sym_dir):
    """Generate a ZIP file of the contents of sym_dir.

    :param zip_filename: full path to zip file
    :param sym_dir: full path to directory of SYM files
    :returns: path to zip file
    """
    with zipfile.ZipFile(zip_filename, mode="w") as fp:
        for root, dirs, files in os.walk(sym_dir):
            # sorted() makes archive member order deterministic across runs
            for sym_file in sorted(files):
                full_path = os.path.join(root, sym_file)
                # relpath (rather than slicing off len(sym_dir)) produces
                # correct archive names whether or not sym_dir ends with "/"
                arcname = os.path.relpath(full_path, sym_dir)
                fp.write(
                    full_path,
                    arcname=arcname,
                    compress_type=zipfile.ZIP_DEFLATED,
                )
    # The docstring always promised the path; the original returned None
    return zip_filename
@try_harder
def upload_zip_file(session, zip_path):
    """Upload the zip file at ``zip_path`` to the symbols upload API.

    :param session: a requests Session
    :param zip_path: path to the zip file to upload
    :raises requests.HTTPError: if the upload does not return 200

    NOTE(review): the original named this parameter ``zipfile`` (shadowing
    the stdlib module) and then ignored it entirely, always opening the
    hard-coded "symbols.zip". Now the argument is actually used; for the
    existing call sites (which pass "symbols.zip") behavior is unchanged.
    """
    headers = {"auth-token": UPLOAD_AUTH_TOKEN}
    with open(zip_path, "rb") as fp:
        data = fp.read()
    resp = session.post(
        UPLOAD_API,
        headers=headers,
        allow_redirects=False,
        # generous (connect, read) timeouts--uploads are ~150 MB
        timeout=(300, 300),
        files={os.path.basename(zip_path): data},
    )
    if resp.status_code != 200:
        resp.raise_for_status()
def get_size(fn):
    """Get the size of a file.

    :param fn: the filename to check
    :returns: 0 if the file doesn't exist; file size otherwise
    """
    # EAFP: stat once instead of exists-then-stat (avoids a race and a
    # second filesystem hit); any OSError means "treat as absent"
    try:
        return os.stat(fn).st_size
    except OSError:
        return 0
def niceify_seconds(s):
    """Format a duration in seconds as a rough human-friendly string.

    Over an hour -> "Nh", over a minute -> "Nm", otherwise "Ns".
    """
    if s > 3600:
        return f"{s / 60 / 60:,.0f}h"
    if s > 60:
        return f"{s / 60:,.0f}m"
    # Use :,.0f like the h/m branches; the original used plain {s:,}, which
    # printed raw floats such as "45.6s" when fed time.time() deltas
    return f"{s:,.0f}s"
def handle_keyboard_interrupt(fun):
    """Decorator: exit with status 1 on Ctrl-C instead of dumping a traceback.

    Adds ``functools.wraps`` so the wrapped function keeps its metadata
    (the original decorator discarded it).
    """
    @functools.wraps(fun)
    def _handle_keyboard_interrupt(*args, **kwargs):
        try:
            return fun(*args, **kwargs)
        except KeyboardInterrupt:
            print("KeyboardInterrupt!")
            sys.exit(1)
    return _handle_keyboard_interrupt
@handle_keyboard_interrupt
def main():
    """Drive the whole job: read the key list, skip work already done,
    download/compress each file, and upload a symbols.zip roughly every
    SYMBOLS_ZIP_GOOD_SIZE bytes of accumulated compressed output.

    Progress is recorded in last_upload.txt so a crashed run can resume.
    """
    # Starting line index: a CLI argument wins; otherwise resume from
    # last_upload.txt if a previous run left one behind.
    skip = 0
    if len(sys.argv) > 1:
        skip = int(sys.argv[1])
    elif os.path.exists("last_upload.txt"):
        with open("last_upload.txt", "r") as fp:
            skip = int(fp.read().strip())

    # One bucket key (filename) per line; "#" lines are skipped in the loop
    with open("uncompressed_keys.txt", "r") as fp:
        lines = fp.readlines()

    # Remove things from previous run of the script if they're there
    if os.path.exists(SYMDIR):
        shutil.rmtree(SYMDIR)
    pathlib.Path("symbols.zip").unlink(missing_ok=True)

    print(">>> Number of files: %s" % len(lines))
    total_size = 0
    total_files = 0
    start_time = time.time()
    session = retry_session()
    for i, line in enumerate(lines):
        # NOTE(review): skip holds the index of the LAST uploaded line, so
        # that line is re-processed on resume; head_file() below catches the
        # duplicate and skips it.
        if i < skip:
            continue
        fn = line.strip()
        if not fn or line.startswith("#"):
            continue
        print(">>> Working on %s (%d/%d)" % (fn, i, len(lines)))
        print(">>> %s" % datetime.datetime.now())
        # Check if the file is there--if it is, skip it
        if head_file(session, fn):
            print(">>> already there--skipping")
            continue
        cache_fn = os.path.join(DATADIR, fn)
        # cab naming convention: last character replaced with "_"
        symbols_tmp_fn = os.path.join(SYMDIR, fn[:-1] + "_")
        # If we don't have the file on disk, download it
        if not download_file(session, fn, cache_fn):
            continue
        # Compress file to symbols tmp dir
        compress_file(cache_fn, symbols_tmp_fn)
        file_size = get_size(symbols_tmp_fn)
        total_size += file_size
        total_files += 1
        # Add file sizes up and if that's less than the good size, we can just
        # blithly skip along without building and computing a zip file size
        if total_size < SYMBOLS_ZIP_GOOD_SIZE:
            print(f">>> accumulated size: {total_size:,}")
            continue
        # If zip file is big enough, upload it
        build_zip_file("symbols.zip", SYMDIR)
        symbols_zip_size = get_size("symbols.zip")
        print(f">>> symbols.zip size: {symbols_zip_size:,}")
        print(">>> uploading ...")
        upload_zip_file(session, "symbols.zip")
        # Save the line number of the last upload so we can re-start there if the script
        # dies
        with open("last_upload.txt", "w") as fp:
            fp.write("%s" % i)
        total_time = time.time() - start_time
        # ETA: cumulative average seconds per file times the files remaining
        est_left = niceify_seconds(((time.time() - start_time) / total_files) * (len(lines) - i))
        print(f"Total: {total_time:,}s Files: {i}/{len(lines)} Est left: {est_left}")
        print("")
        # Clean up this pass
        total_size = 0
        shutil.rmtree(SYMDIR)
        pathlib.Path("symbols.zip").unlink(missing_ok=True)

    # After the loop: upload whatever accumulated but never hit the size
    # threshold (may be smaller than SYMBOLS_ZIP_GOOD_SIZE)
    build_zip_file("symbols.zip", SYMDIR)
    symbols_zip_size = get_size("symbols.zip")
    print(f">>> symbols.zip size: {symbols_zip_size:,}")
    print(">>> uploading ...")
    upload_zip_file(session, "symbols.zip")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment