Skip to content

Instantly share code, notes, and snippets.

@edsu
Last active February 19, 2024 03:08
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save edsu/b3b5195bd5c16f554017bac3cd90a53b to your computer and use it in GitHub Desktop.
Save edsu/b3b5195bd5c16f554017bac3cd90a53b to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
#
# usage: wacz-images.py <wacz_file>
#
# This program will extract images from the WARC files contained in a WACZ
# file and write them to the current working directory using the image's URL
# as a file location.
#
# You will need to `pip install warcio` for it to work.
#
import sys
import logging
from pathlib import Path
from zipfile import ZipFile
from urllib.parse import urlparse
from warcio import ArchiveIterator
def main():
    """Extract every image found in the WACZ file named on the command line.

    Reads the WACZ path from ``sys.argv[1]``, walks each WARC member returned
    by ``warc_files`` and hands every record to ``extract``. Progress and
    errors are logged to ``wacz-images.log`` in the current directory.

    Raises:
        SystemExit: if no WACZ path was supplied.
    """
    if len(sys.argv) < 2:
        # Fail with a readable message instead of a bare IndexError.
        sys.exit("usage: wacz-images.py <wacz_file>")

    logging.basicConfig(filename="wacz-images.log", level=logging.INFO)

    # Context managers make sure the zip archive and each member stream are
    # closed even if a record fails to parse part-way through.
    with ZipFile(sys.argv[1]) as wacz:
        for warc_file in warc_files(wacz):
            with wacz.open(warc_file, "r") as stream:
                for rec in ArchiveIterator(stream):
                    extract(rec)
def warc_files(wacz):
    """Return the names of the WARC members stored in a WACZ archive.

    Args:
        wacz: An open ``zipfile.ZipFile`` (any object with ``namelist()``).

    Returns:
        A list of member names, in archive order, whose names end in
        ``.warc.gz`` or ``.warc``.
    """
    # Accept uncompressed WARCs too, so they are not silently skipped.
    return [
        name
        for name in wacz.namelist()
        if name.endswith((".warc.gz", ".warc"))
    ]
def extract(rec):
    """Write the payload of an image response record to disk.

    Records that are not ``response`` records, or whose HTTP ``Content-Type``
    does not start with ``image/``, are ignored. The output location is
    derived from the record's ``WARC-Target-URI`` via ``get_path``; missing
    parent directories are created. Filesystem errors are logged and do not
    stop processing of later records.

    Args:
        rec: A WARC record as yielded by ``warcio.ArchiveIterator``.
    """
    if rec.rec_type != "response":
        return

    # NOTE(review): http_headers appears to be None for response records
    # that carry no parsed HTTP header block (e.g. non-HTTP targets); skip
    # those rather than crash on attribute access.
    headers = rec.http_headers
    if headers is None:
        return

    # Media types are case-insensitive, so normalise before matching.
    content_type = headers.get("Content-Type", "").lower()
    if not content_type.startswith("image/"):
        return

    path = get_path(rec.rec_headers["WARC-Target-URI"])
    try:
        # Inside the try: a parent component may already exist as a regular
        # file (an earlier URL saved at that name), which raises OSError.
        path.parent.mkdir(parents=True, exist_ok=True)
        logging.info(path)
        with path.open("wb") as out:
            out.write(rec.content_stream().read())
    except OSError as e:
        logging.error(e)
def get_path(url):
    """Map a URL to a relative filesystem path under the working directory.

    The path is built from the URL's host, path and query string joined
    together (the query is appended directly, with no separator). Because
    the URL comes from archived web content, the result is sanitised: any
    leading root and all ``..`` components are dropped so the returned path
    can never point outside the current directory.

    Args:
        url: The target URI of a record.

    Returns:
        A relative ``pathlib.Path``.
    """
    uri = urlparse(url)
    raw = Path(uri.netloc + uri.path + uri.query)
    # Strip the root/drive anchor (present when the URL has no host) ...
    parts = raw.parts[1:] if raw.anchor else raw.parts
    # ... and refuse to walk upwards out of the output directory.
    return Path(*(part for part in parts if part != ".."))
# Run the extractor only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment