edsu/wacz-images.py

## wacz-images.py
#!/usr/bin/env python3

#
# usage: wacz-images.py <wacz_file>
#
# This program will extract images from the WARC files contained in a WACZ
# file and write them to the current working directory using the image's URL
# as a file location.
#
# You will need to `pip install warcio` for it to work.
#

import sys
import logging

from pathlib import Path
from zipfile import ZipFile
from urllib.parse import urlparse
from warcio import ArchiveIterator

def main():
    """Extract every image found in the WACZ file named on the command line.

    Reads the WACZ path from ``sys.argv[1]``, iterates over each archive
    member returned by ``warc_files`` and passes every WARC record in it to
    ``extract``. Log output goes to ``wacz-images.log`` in the current
    working directory.

    Raises:
        SystemExit: if no WACZ file was given on the command line.
    """
    # Validate the arguments first so a missing path yields a usage message
    # rather than an IndexError traceback.
    if len(sys.argv) < 2:
        sys.exit("usage: wacz-images.py <wacz_file>")

    logging.basicConfig(filename="wacz-images.log", level=logging.INFO)

    # Context managers guarantee the zip archive and each member stream are
    # closed, even if a record fails to parse part-way through.
    with ZipFile(sys.argv[1]) as wacz:
        for warc_file in warc_files(wacz):
            with wacz.open(warc_file, "r") as stream:
                for rec in ArchiveIterator(stream):
                    extract(rec)

def warc_files(wacz):
    """Return the names of the gzipped WARC members of an open WACZ archive.

    Args:
        wacz: an object exposing ``namelist()`` (e.g. ``zipfile.ZipFile``).

    Returns:
        A list of member names ending in ``.warc.gz``, in archive order.
    """
    return [name for name in wacz.namelist() if name.endswith(".warc.gz")]

def extract(rec):
    """Write the payload of an image response record to disk.

    Records that are not ``response`` records with an ``image/*``
    Content-Type are ignored. For the rest, the destination is derived from
    the record's ``WARC-Target-URI`` via ``get_path`` and any missing parent
    directories are created. Filesystem errors are logged and swallowed so
    that one bad record does not abort the whole run.

    Args:
        rec: a WARC record as yielded by ``ArchiveIterator``.
    """
    if rec.rec_type != "response":
        return

    # A response record may carry no parsed HTTP headers (presumably for
    # non-HTTP target URIs); skip it instead of raising AttributeError.
    headers = rec.http_headers
    if headers is None:
        return
    if not headers.get("Content-Type", "").startswith("image/"):
        return

    url = rec.rec_headers["WARC-Target-URI"]
    if not url:
        return

    path = get_path(url)
    try:
        # Directory creation lives inside the try block: a URL component that
        # already exists as a regular file raises OSError, which should be
        # logged like any other write failure rather than stop the run.
        path.parent.mkdir(parents=True, exist_ok=True)
        logging.info(path)
        # write_bytes opens, writes and closes the file in a single call.
        path.write_bytes(rec.content_stream().read())
    except OSError as e:
        logging.error(e)

def get_path(url):
    """Map a URL to a relative filesystem path under the working directory.

    The path is built from the URL's host, path and query string. The query
    is appended directly, with no ``?`` separator, which keeps file names
    identical to those produced by earlier runs.

    Args:
        url: the URL string to convert.

    Returns:
        A relative ``pathlib.Path`` that never contains ``..`` components.
    """
    uri = urlparse(url)
    raw = uri.netloc + uri.path + uri.query
    # URLs come from archived (untrusted) content: drop "..", "." and empty
    # segments so the result is always relative and cannot climb out of the
    # current working directory.
    segments = [seg for seg in raw.split("/") if seg not in ("", ".", "..")]
    return Path(*segments)

# Run the extraction only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
	#!/usr/bin/env python3

	#
	# usage: wacz-images.py <wacz_file>
	#
	# This program will extract images from the WARC files contained in a WACZ
	# file and write them to the current working directory using the image's URL
	# as a file location.
	#
	# You will need to `pip install warcio` for it to work.
	#

	import sys
	import logging

	from pathlib import Path
	from zipfile import ZipFile
	from urllib.parse import urlparse
	from warcio import ArchiveIterator

	def main():
	    """Extract every image found in the WACZ file named on the command line.

	    Reads the WACZ path from ``sys.argv[1]``, iterates over each archive
	    member returned by ``warc_files`` and passes every WARC record in it to
	    ``extract``. Log output goes to ``wacz-images.log`` in the current
	    working directory.

	    Raises:
	        SystemExit: if no WACZ file was given on the command line.
	    """
	    # Validate the arguments first so a missing path yields a usage message
	    # rather than an IndexError traceback.
	    if len(sys.argv) < 2:
	        sys.exit("usage: wacz-images.py <wacz_file>")

	    logging.basicConfig(filename="wacz-images.log", level=logging.INFO)

	    # Context managers guarantee the zip archive and each member stream are
	    # closed, even if a record fails to parse part-way through.
	    with ZipFile(sys.argv[1]) as wacz:
	        for warc_file in warc_files(wacz):
	            with wacz.open(warc_file, "r") as stream:
	                for rec in ArchiveIterator(stream):
	                    extract(rec)

	def warc_files(wacz):
	    """Return the names of the gzipped WARC members of an open WACZ archive.

	    Args:
	        wacz: an object exposing ``namelist()`` (e.g. ``zipfile.ZipFile``).

	    Returns:
	        A list of member names ending in ``.warc.gz``, in archive order.
	    """
	    return [name for name in wacz.namelist() if name.endswith(".warc.gz")]

	def extract(rec):
	    """Write the payload of an image response record to disk.

	    Records that are not ``response`` records with an ``image/*``
	    Content-Type are ignored. For the rest, the destination is derived from
	    the record's ``WARC-Target-URI`` via ``get_path`` and any missing parent
	    directories are created. Filesystem errors are logged and swallowed so
	    that one bad record does not abort the whole run.

	    Args:
	        rec: a WARC record as yielded by ``ArchiveIterator``.
	    """
	    if rec.rec_type != "response":
	        return

	    # A response record may carry no parsed HTTP headers (presumably for
	    # non-HTTP target URIs); skip it instead of raising AttributeError.
	    headers = rec.http_headers
	    if headers is None:
	        return
	    if not headers.get("Content-Type", "").startswith("image/"):
	        return

	    url = rec.rec_headers["WARC-Target-URI"]
	    if not url:
	        return

	    path = get_path(url)
	    try:
	        # Directory creation lives inside the try block: a URL component that
	        # already exists as a regular file raises OSError, which should be
	        # logged like any other write failure rather than stop the run.
	        path.parent.mkdir(parents=True, exist_ok=True)
	        logging.info(path)
	        # write_bytes opens, writes and closes the file in a single call.
	        path.write_bytes(rec.content_stream().read())
	    except OSError as e:
	        logging.error(e)

	def get_path(url):
	    """Map a URL to a relative filesystem path under the working directory.

	    The path is built from the URL's host, path and query string. The query
	    is appended directly, with no ``?`` separator, which keeps file names
	    identical to those produced by earlier runs.

	    Args:
	        url: the URL string to convert.

	    Returns:
	        A relative ``pathlib.Path`` that never contains ``..`` components.
	    """
	    uri = urlparse(url)
	    raw = uri.netloc + uri.path + uri.query
	    # URLs come from archived (untrusted) content: drop "..", "." and empty
	    # segments so the result is always relative and cannot climb out of the
	    # current working directory.
	    segments = [seg for seg in raw.split("/") if seg not in ("", ".", "..")]
	    return Path(*segments)

	# Run the extraction only when this file is executed as a script, not when
	# it is imported as a module.
	if __name__ == "__main__":
	    main()