Skip to content

Instantly share code, notes, and snippets.

Last active September 16, 2022 01:14
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
What would you like to do?
#!/usr/bin/env python3
# usage: <wacz_file>
# This program will extract images from the WARC files contained in a WACZ
# file and write them to the current working directory using the image's URL
# as a file location.
# You will need to `pip install warcio` for it to work.
import sys
import logging
from pathlib import Path
from zipfile import ZipFile
from urllib.parse import urlparse
from warcio import ArchiveIterator
def main():
    """Extract the images from the WACZ file named on the command line.

    Configures logging to ``wacz-images.log``, opens the WACZ (a ZIP
    archive), and passes every record of every ``.warc.gz`` member to
    ``extract``.

    Exits with a usage message when no WACZ path was given.
    """
    logging.basicConfig(filename="wacz-images.log", level=logging.INFO)

    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} <wacz_file>")
    wacz_file = sys.argv[1]

    # Context managers make sure the archive and each member stream are
    # closed even if a record fails to parse.
    with ZipFile(wacz_file) as wacz:
        for warc_file in warc_files(wacz):
            # The WARC is read straight out of the ZIP; nothing is unpacked
            # to disk first.
            with wacz.open(warc_file, "r") as stream:
                for rec in ArchiveIterator(stream):
                    extract(rec)
def warc_files(wacz):
    """Return the names of the gzipped WARC members of *wacz*.

    *wacz* is any object with a ``namelist()`` method (such as an open
    ``zipfile.ZipFile``). The result is a list of the member names ending
    in ``.warc.gz``, in the order ``namelist()`` reports them.
    """
    return [name for name in wacz.namelist() if name.endswith(".warc.gz")]
def extract(rec):
    """Write the payload of *rec* to disk if it is an image response.

    Only records whose type is ``response`` and whose HTTP ``Content-Type``
    starts with ``image/`` are written; every other record is ignored. The
    output location is derived from the record's ``WARC-Target-URI`` via
    ``get_path``, and missing parent directories are created.

    Filesystem errors are logged and swallowed so that one unwritable image
    does not stop the rest of the extraction.
    """
    if rec.rec_type != "response":
        return
    # Guard against response records that have no parsed HTTP headers.
    if rec.http_headers is None:
        return
    if not rec.http_headers.get("Content-Type", "").startswith("image/"):
        return

    path = get_path(rec.rec_headers["WARC-Target-URI"])
    try:
        # exist_ok avoids a race between checking for and creating the
        # directory; failures here are reported like write failures.
        path.parent.mkdir(parents=True, exist_ok=True)
        with path.open("wb") as out:
            out.write(rec.content_stream().read())
    except OSError as e:
        logging.error("unable to write %s: %s", path, e)
    else:
        logging.info("wrote %s", path)
def get_path(url):
    """Map *url* to a relative filesystem path for the extracted file.

    The path is the URL's network location followed by its path and query
    string, concatenated directly (no ``?`` separator is kept). The result
    is always relative and never contains ``..`` components, so a crafted
    URL cannot point outside the directory the caller writes into.
    """
    uri = urlparse(url)
    raw = Path(uri.netloc + uri.path + uri.query)
    # Drop the root (present when the URL has no netloc) and any parent
    # references; pathlib has already discarded "." and empty segments.
    safe_parts = [part for part in raw.parts if part != raw.anchor and part != ".."]
    return Path(*safe_parts)
# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment