Skip to content

Instantly share code, notes, and snippets.

@Lewiscowles1986
Forked from kafran/extract_har.py
Last active May 8, 2024 17:00
Show Gist options
  • Save Lewiscowles1986/645e79295efa84698f4e45cd06d610ea to your computer and use it in GitHub Desktop.
Save Lewiscowles1986/645e79295efa84698f4e45cd06d610ea to your computer and use it in GitHub Desktop.
Python 3 script to extract images from HTTP Archive (HAR) files
import json
import base64
import os
import pathlib
from urllib.parse import urlparse
# list of supported image mime-types
# Special thanks to https://gist.github.com/FurloSK/0477e01024f701db42341fc3223a5d8c
# Special mention, and thanks to MDN
# https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types
# Maps a response MIME type to the file extension used when saving the body.
# Each MIME type appears exactly once: a dict literal silently keeps only the
# last value of a repeated key, so listing alternatives has no effect.
mimetypes = {
    "image/webp": ".webp",
    "image/jpeg": ".jpg",  # ".jpeg" is the other common spelling
    "image/png": ".png",
    "image/svg+xml": ".svg",
    "image/avif": ".avif",
    "image/bmp": ".bmp",
    "image/gif": ".gif",
    "image/vnd.microsoft.icon": ".ico",
    "image/x-icon": ".ico",  # unofficial alias many servers send for favicons
    "image/tiff": ".tiff",  # ".tif" is the other common spelling
}
# The output directory (and any sub-directories mirroring the URL path) is
# created on demand, so nothing has to exist before running.
folder = os.path.join(os.getcwd(), "imgs")


def _target_path(out_dir, url, ext):
    """Return the file path below *out_dir* that mirrors the path of *url*.

    The suffix of the last URL path component is replaced by *ext*. Returns
    ``None`` when the resulting path would not be located inside *out_dir*.
    """
    # URL paths are "/"-separated on every platform. Empty, "." and ".."
    # components are dropped so a crafted URL cannot climb out of out_dir.
    parts = [
        part
        for part in urlparse(url).path.split("/")
        if part not in ("", ".", "..")
    ]
    # A URL such as "https://host/" has no file name; fall back to "index"
    # instead of producing a nameless file like ".png".
    stem = pathlib.PurePosixPath(parts[-1]).stem if parts else "index"
    root = pathlib.Path(out_dir).resolve()
    target = root.joinpath(*parts[:-1], f"{stem}{ext}").resolve()
    # Second line of defence for components that are only special on some
    # platforms (e.g. backslashes or drive letters on Windows).
    return target if root in target.parents else None


def _decode_body(text, encoding):
    """Convert a HAR ``content.text`` value into the bytes to write.

    *encoding* is the entry's ``content.encoding`` value, or ``"literal"``
    when the entry has none; literal text is stored as UTF-8, anything else
    is treated as base64.
    """
    if encoding == "literal":
        return text.encode(encoding="UTF-8", errors="strict")
    return base64.b64decode(text)


def extract_images(har_path="src.har", out_dir=None, extensions=None):
    """Write every image response stored in a HAR file to disk.

    Args:
        har_path: Path of the HTTP Archive (JSON) file to read.
        out_dir: Directory receiving the images; defaults to ``folder``.
        extensions: Mapping of MIME type to file extension; defaults to the
            module-level ``mimetypes`` table.

    Returns:
        A list with the path of every file written, in writing order.

    Raises:
        OSError: If the HAR file cannot be read or an image cannot be written.
        ValueError: If the HAR file is not valid JSON.
        KeyError: If the document lacks ``log.entries`` or an entry that is
            being saved lacks ``request.url``.
    """
    if out_dir is None:
        out_dir = folder
    if extensions is None:
        extensions = mimetypes

    with open(har_path, "rb") as har_file:
        har = json.load(har_file)

    written = []
    for entry in har["log"]["entries"]:
        content = (entry.get("response") or {}).get("content") or {}
        body = content.get("text")
        if not body:
            continue

        # Servers may append parameters ("image/svg+xml; charset=utf-8") or
        # use a different case; compare on the bare, lower-cased type only.
        mimetype = (content.get("mimeType") or "").split(";", 1)[0].strip().lower()
        ext = extensions.get(mimetype)
        if ext is None:
            continue

        url = entry["request"]["url"]
        target = _target_path(out_dir, url, ext)
        if target is None:
            print(f"skipped (path escapes output directory): {url}")
            continue

        try:
            data = _decode_body(body, content.get("encoding", "literal"))
        except ValueError as err:
            # binascii.Error (bad base64) and UnicodeEncodeError are both
            # ValueError subclasses; one broken entry should not stop the run.
            print(f"skipped (cannot decode body: {err}): {url}")
            continue

        target.parent.mkdir(parents=True, exist_ok=True)
        print(target)
        # NOTE: entries whose URLs differ only in their query string map to
        # the same file; the last one written wins.
        target.write_bytes(data)
        written.append(str(target))
    return written


# Running the file as a script extracts from "src.har" into ./imgs; importing
# it only defines the functions.
if __name__ == "__main__":
    extract_images("src.har", folder)
@dnk8n
Copy link

dnk8n commented Nov 11, 2023

Would you like to please include an open-source or other type of license so that we know how we are legally allowed to use your code?

@Lewiscowles1986
Copy link
Author

dnk8n, however the heck you like; use it to burn baby sheep for all I care.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment