Skip to content

Instantly share code, notes, and snippets.

@edsu
Last active September 2, 2023 18:43
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save edsu/c2c85f633562e69f6db2a2d2d2c2517e to your computer and use it in GitHub Desktop.
Save edsu/c2c85f633562e69f6db2a2d2d2c2517e to your computer and use it in GitHub Desktop.
Extract images from a WARC file. usage: extract_images.py <warc_file>
#!/usr/bin/env python3
import sys
import pathlib
from urllib.parse import urlparse
from warcio.archiveiterator import ArchiveIterator
def save(url, stream):
    """Write the bytes of *stream* to a local file named after *url*.

    The file is created relative to the current working directory at
    ``<netloc>/<url path>``. Missing parent directories are created, and the
    resulting path is printed once the file has been written.

    Args:
        url: URL the content came from. Only its network location and path
            are used to build the file name; the query string and fragment
            are ignored.
        stream: Binary file-like object; its entire content is read with
            ``read()`` and written to the file.
    """
    uri = urlparse(url)

    # URLs come from archived web content, so treat them as untrusted: drop
    # empty, "." and ".." segments so the result can never be absolute or
    # climb out of the per-host directory.
    unsafe = ('', '.', '..')
    segments = [part for part in uri.path.split('/') if part not in unsafe]

    # A directory-style URL (no path, or a trailing slash) must not be saved
    # as a file named after the directory itself, otherwise later files
    # below that directory could no longer be created.
    if not segments or uri.path.endswith('/'):
        segments.append('index')

    if uri.netloc not in unsafe:
        segments.insert(0, uri.netloc)

    path = pathlib.Path(*segments)
    path.parent.mkdir(parents=True, exist_ok=True)

    # Context manager guarantees the handle is flushed and closed even if
    # reading from the stream fails part-way.
    with path.open('wb') as out:
        out.write(stream.read())
    print(path)
def extract_images(warc_file):
    """Save every image response stored in a WARC file to disk.

    Iterates over the records of *warc_file* and, for each ``response``
    record whose HTTP ``Content-Type`` header mentions ``image``, passes the
    record's target URI and content stream to ``save``.

    Args:
        warc_file: Path of the WARC file to read; opened in binary mode.
    """
    with open(warc_file, 'rb') as stream:
        for record in ArchiveIterator(stream):
            if record.rec_type != 'response':
                continue

            # NOTE(review): http_headers is expected to be None for response
            # records that carry no HTTP payload (e.g. dns: lookups) --
            # skip those instead of failing with AttributeError.
            http_headers = record.http_headers
            if http_headers is None:
                continue

            # A response without a Content-Type header yields None; treat it
            # as "not an image". Media types are case-insensitive.
            content_type = http_headers.get_header('Content-Type') or ''
            if 'image' not in content_type.lower():
                continue

            url = record.rec_headers.get_header('WARC-Target-URI')
            if url:
                save(url, record.content_stream())
if __name__ == "__main__":
    # Exactly one argument (the WARC file) is expected after the script name.
    if len(sys.argv) == 2:
        warc_file = sys.argv[-1]
        extract_images(warc_file)
    else:
        sys.exit('usage: extract_images.py <warc_file>')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment