Skip to content

Instantly share code, notes, and snippets.

@lebowitz
Forked from Lewiscowles1986/extract_har.py
Last active February 6, 2024 15:13
Show Gist options
  • Save lebowitz/5e11958b7578a5edf12412af1c5d7119 to your computer and use it in GitHub Desktop.
Save lebowitz/5e11958b7578a5edf12412af1c5d7119 to your computer and use it in GitHub Desktop.
Python 3 script to extract images from HTTP Archive (HAR) files
import json
import base64
import os
import pathlib
from urllib.parse import urlparse
# Supported image MIME types, mapped to the file extension used when saving.
# Special thanks to https://gist.github.com/FurloSK/0477e01024f701db42341fc3223a5d8c
# Special mention, and thanks to MDN
# https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types
#
# Every key appears exactly once: in a dict literal a repeated key silently
# overrides the earlier entry, so listing two extensions for one MIME type
# only ever keeps the last one.
mimetypes = {
    "image/webp": ".webp",
    "image/jpeg": ".jpg",  # ".jpeg" is equally valid; ".jpg" is the one in use
    "image/jpg": ".jpg",  # not an official MIME type, but accepted here
    "image/png": ".png",
    "image/svg+xml": ".svg",
    "image/avif": ".avif",
    "image/bmp": ".bmp",
    "image/gif": ".gif",
    "image/vnd.microsoft.icon": ".ico",
    "image/x-icon": ".ico",  # legacy alias commonly used for favicons
    "image/tiff": ".tiff",  # ".tif" is equally valid; ".tiff" is the one in use
}
def _image_extension(mimetype, extensions):
    """Return the extension registered for *mimetype*, or None if unsupported."""
    # The recorded value may carry parameters ("image/svg+xml; charset=utf-8")
    # or differ in case, so compare only the normalised "type/subtype" part.
    essence = (mimetype or "").split(";", 1)[0].strip().lower()
    return extensions.get(essence)


def _entry_basename(request_url):
    """Flatten the path of *request_url* into a single file-name component."""
    path = urlparse(request_url).path
    # A root or empty path would give an empty name (and a hidden ".ext" file).
    return path[1:].replace("/", "_") or "index"


def _decode_body(text, encoding):
    """Return the raw bytes of a HAR response body."""
    # "literal" is the placeholder used when the entry has no "encoding" field,
    # meaning the body was stored as plain text; anything else is treated as
    # base64, as before.
    if encoding == "literal":
        return text.encode(encoding="UTF-8", errors="strict")
    return base64.b64decode(text)


def extract_images(har_path="src.har", out_dir=None, extensions=None):
    """Write every image response stored in a HAR file to *out_dir*.

    Args:
        har_path: Path of the HAR (JSON) file to read.
        out_dir: Directory that receives the images; created when missing.
            Defaults to an "imgs" folder inside the current working directory.
        extensions: Mapping of MIME type to file extension. Defaults to the
            module-level ``mimetypes`` table.

    Returns:
        The paths of the files written, in the order the entries appear.

    Raises:
        OSError: If the HAR file cannot be read or an image cannot be written.
        ValueError: If the HAR file is not valid JSON.
        KeyError: If the HAR structure lacks "log", "entries", "request",
            "url", "response" or "content".
    """
    if extensions is None:
        extensions = mimetypes
    if out_dir is None:
        out_dir = os.path.join(os.getcwd(), "imgs")
    os.makedirs(out_dir, exist_ok=True)

    with open(har_path, "rb") as har_file:
        har = json.load(har_file)

    written = []
    used_names = set()
    for entry in har["log"]["entries"]:
        content = entry["response"]["content"]
        ext = _image_extension(content.get("mimeType"), extensions)
        body = content.get("text")
        # Skip non-image responses and entries whose body was not recorded.
        if ext is None or not body:
            continue

        name = _entry_basename(entry["request"]["url"])
        # Avoid names such as "logo.png.png" when the URL already ends in ext.
        if not name.lower().endswith(ext):
            name += ext

        # Different URLs can flatten to the same name (e.g. they differ only
        # in query string or host); number the later ones instead of silently
        # overwriting the earlier image. Compared case-insensitively because
        # some file systems are.
        stem, suffix = os.path.splitext(name)
        candidate = name
        counter = 1
        while candidate.lower() in used_names:
            candidate = f"{stem}_{counter}{suffix}"
            counter += 1
        used_names.add(candidate.lower())

        target = os.path.join(out_dir, candidate)
        print(target)
        with open(target, "wb") as image_file:
            image_file.write(
                _decode_body(body, content.get("encoding", "literal"))
            )
        written.append(target)
    return written


if __name__ == "__main__":
    extract_images()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment