Skip to content

Instantly share code, notes, and snippets.

@edsu
Last active September 16, 2022 01:14
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Embed
What would you like to do?
#!/usr/bin/env python3
#
# usage: wacz-images.py <wacz_file>
#
# This program will extract images from the WARC files contained in a WACZ
# file and write them to the current working directory using the image's URL
# as a file location.
#
# You will need to `pip install warcio` for it to work.
#
import sys
import logging
from pathlib import Path
from zipfile import ZipFile
from urllib.parse import urlparse
from warcio import ArchiveIterator
def main():
    """Extract every image found in the WACZ file named on the command line.

    Reads the WACZ path from ``sys.argv[1]``, walks each WARC member of the
    archive, and passes every record to ``extract``. Progress and errors are
    logged to ``wacz-images.log`` in the current working directory.

    Exits with a usage message when no WACZ path is supplied.
    """
    if len(sys.argv) < 2:
        # A bare IndexError is unhelpful for a command-line tool.
        sys.exit("usage: wacz-images.py <wacz_file>")

    logging.basicConfig(filename="wacz-images.log", level=logging.INFO)
    wacz_file = sys.argv[1]

    # Context managers make sure the archive and each member stream are
    # closed even if reading a record fails part-way through.
    with ZipFile(wacz_file) as wacz:
        for warc_file in warc_files(wacz):
            with wacz.open(warc_file, "r") as stream:
                for rec in ArchiveIterator(stream):
                    extract(rec)
def warc_files(wacz):
    """Return the names of the WARC members inside an open WACZ archive.

    Args:
        wacz: an object exposing ``namelist()`` (e.g. ``zipfile.ZipFile``).

    Returns:
        A list of member names, in archive order, ending in ``.warc.gz`` or
        ``.warc``.
    """
    # Uncompressed ``.warc`` members are accepted too; filtering on
    # ``.warc.gz`` alone would silently skip them.
    return [
        name
        for name in wacz.namelist()
        if name.endswith((".warc.gz", ".warc"))
    ]
def extract(rec):
    """Write the payload of an image response record to disk.

    Records that are not ``response`` records, that carry no HTTP headers,
    or whose ``Content-Type`` does not start with ``image/`` are ignored.
    For image responses the output location is derived from the record's
    ``WARC-Target-URI`` via ``get_path``; missing parent directories are
    created. Filesystem errors are logged and do not stop the run.

    Args:
        rec: a WARC record as yielded by ``ArchiveIterator``.
    """
    if rec.rec_type != "response":
        return

    headers = rec.http_headers
    if headers is None:
        # NOTE(review): presumably happens for non-HTTP responses (e.g. DNS
        # lookups); without this guard the .get() below raises AttributeError.
        return

    if not headers.get("Content-Type", "").startswith("image/"):
        return

    path = get_path(rec.rec_headers["WARC-Target-URI"])
    try:
        # mkdir lives inside the try: if one URL was saved as a file and a
        # later URL needs that same name as a directory, the resulting
        # OSError is logged instead of aborting the whole extraction.
        path.parent.mkdir(parents=True, exist_ok=True)
        logging.info(path)
        # write_bytes opens, writes and closes the file in one step.
        path.write_bytes(rec.content_stream().read())
    except OSError as e:
        logging.error(e)
def get_path(url):
    """Map a URL to a relative filesystem path for storing its content.

    The host, URL path and query string are concatenated (the query is
    appended directly, with no separator, preserving the existing file
    naming) and split on ``/``. Empty, ``.`` and ``..`` segments are
    discarded so that the result is always relative and a crafted URL in the
    archive cannot climb out of the working directory.

    Args:
        url: the URL string to convert.

    Returns:
        A relative ``pathlib.Path``.
    """
    uri = urlparse(url)
    raw = uri.netloc + uri.path + uri.query
    # Dropping "" also removes a leading "/" (when the URL has no host),
    # which would otherwise make the resulting path absolute.
    segments = [seg for seg in raw.split("/") if seg not in ("", ".", "..")]
    return Path(*segments)
# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment