Skip to content

Instantly share code, notes, and snippets.

@simonw
Created December 26, 2025 17:41
Show Gist options
  • Select an option

  • Save simonw/007c628ceb84d0da0795b57af7b74d7d to your computer and use it in GitHub Desktop.

Select an option

Save simonw/007c628ceb84d0da0795b57af7b74d7d to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""Extract HAR file contents into multiple files."""
import json
import os
import base64
from urllib.parse import urlparse
from pathlib import Path
def sanitize_filename(name):
    """Sanitize a string to be safe for use as a filename.

    Replaces filesystem/URL-problematic characters with '_' and truncates
    the result to 100 characters to avoid filesystem name-length limits.
    """
    # One C-level pass via str.translate instead of 11 chained .replace() calls.
    table = str.maketrans({ch: '_' for ch in '/\\:*?"<>|&='})
    return name.translate(table)[:100]
def get_extension_from_mime(mime_type):
"""Get file extension from MIME type."""
mime_map = {
'text/html': '.html',
'text/css': '.css',
'text/javascript': '.js',
'application/javascript': '.js',
'application/json': '.json',
'image/png': '.png',
'image/jpeg': '.jpg',
'image/gif': '.gif',
'image/webp': '.webp',
'image/svg+xml': '.svg',
'font/woff': '.woff',
'font/woff2': '.woff2',
'application/font-woff': '.woff',
'application/font-woff2': '.woff2',
}
if mime_type:
base_mime = mime_type.split(';')[0].strip()
return mime_map.get(base_mime, '')
return ''
def _filename_base(index, parsed_url):
    """Build a unique, filesystem-safe base name for one HAR entry."""
    base = sanitize_filename(parsed_url.netloc + parsed_url.path)
    if parsed_url.query:
        base += '_' + sanitize_filename(parsed_url.query)
    if not base:
        base = 'index'
    # Zero-padded entry index guarantees uniqueness across entries.
    return f"{index:04d}_{base}"


def _save_body(responses_dir, base, url_path, content, url):
    """Write a response body to disk if present.

    Returns (relative_path, error): relative_path is the path stored in the
    entry summary (or None if there was no body / the write failed); error is
    the exception text on failure (or None).
    """
    text = content.get('text', '')
    if not text:
        return None, None
    mime_type = content.get('mimeType', '')
    ext = get_extension_from_mime(mime_type)
    if not ext:
        # Fall back to the URL path's extension, then to .txt.
        ext = os.path.splitext(url_path)[1] or '.txt'
    response_filename = base + ext
    response_path = os.path.join(responses_dir, response_filename)
    try:
        if content.get('encoding', '') == 'base64':
            # HAR stores binary bodies base64-encoded.
            with open(response_path, 'wb') as f:
                f.write(base64.b64decode(text))
        else:
            with open(response_path, 'w', encoding='utf-8') as f:
                f.write(text)
    except Exception as e:
        # Best-effort: record the failure in the summary, keep extracting.
        print(f"Error extracting {url}: {e}")
        return None, str(e)
    print(f"Extracted: {response_filename}")
    return os.path.join('responses', response_filename), None


def _extract_entry(entry, index, responses_dir):
    """Extract one HAR entry: write its body and detail file, return a summary dict."""
    request = entry.get('request', {})
    response = entry.get('response', {})
    url = request.get('url', '')
    method = request.get('method', 'GET')
    status = response.get('status', 0)
    parsed_url = urlparse(url)
    base = _filename_base(index, parsed_url)
    content = response.get('content', {})
    mime_type = content.get('mimeType', '')

    summary = {
        'index': index,
        'method': method,
        'url': url,
        'status': status,
        'mimeType': mime_type,
        'size': content.get('size', 0),
        'timings': entry.get('timings', {}),
    }

    rel_path, error = _save_body(responses_dir, base, parsed_url.path, content, url)
    if rel_path:
        summary['response_file'] = rel_path
    if error:
        summary['error'] = error

    # Per-entry detail: headers, cookies, timings (body text lives in its own file).
    entry_detail = {
        'request': {
            'method': method,
            'url': url,
            'httpVersion': request.get('httpVersion'),
            'headers': request.get('headers', []),
            'queryString': request.get('queryString', []),
            'cookies': request.get('cookies', []),
        },
        'response': {
            'status': status,
            'statusText': response.get('statusText'),
            'httpVersion': response.get('httpVersion'),
            'headers': response.get('headers', []),
            'cookies': response.get('cookies', []),
            'content': {
                'size': content.get('size'),
                'mimeType': mime_type,
                'compression': content.get('compression'),
            }
        },
        'timings': entry.get('timings', {}),
        'serverIPAddress': entry.get('serverIPAddress'),
        'startedDateTime': entry.get('startedDateTime'),
    }
    detail_filename = base + '_detail.json'
    with open(os.path.join(responses_dir, detail_filename), 'w', encoding='utf-8') as f:
        json.dump(entry_detail, f, indent=2)
    summary['detail_file'] = os.path.join('responses', detail_filename)
    return summary


def extract_har(har_path, output_dir):
    """Extract a HAR capture into multiple files.

    Output layout:
        output_dir/metadata.json              -- HAR log metadata + entry count
        output_dir/entries_index.json         -- one summary dict per entry
        output_dir/responses/NNNN_<name><ext>         -- response bodies
        output_dir/responses/NNNN_<name>_detail.json  -- per-entry headers/timings

    Args:
        har_path: path to the HAR file (JSON, UTF-8).
        output_dir: directory to create and populate.

    Returns:
        The number of entries processed.

    Raises:
        KeyError: if the HAR JSON has no top-level 'log' object.
    """
    with open(har_path, 'r', encoding='utf-8') as f:
        har = json.load(f)
    log = har['log']

    os.makedirs(output_dir, exist_ok=True)
    responses_dir = os.path.join(output_dir, 'responses')
    os.makedirs(responses_dir, exist_ok=True)

    metadata = {
        'version': log.get('version'),
        'creator': log.get('creator'),
        'browser': log.get('browser'),
        'pages': log.get('pages', []),
        'entries_count': len(log.get('entries', [])),
    }
    with open(os.path.join(output_dir, 'metadata.json'), 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2)
    print("Wrote metadata.json")

    entries_summary = [
        _extract_entry(entry, i, responses_dir)
        for i, entry in enumerate(log.get('entries', []))
    ]

    with open(os.path.join(output_dir, 'entries_index.json'), 'w', encoding='utf-8') as f:
        json.dump(entries_summary, f, indent=2)
    print(f"\nWrote entries_index.json with {len(entries_summary)} entries")
    print(f"\nExtraction complete! Output directory: {output_dir}")
    return len(entries_summary)
if __name__ == '__main__':
    import sys
    # Original hard-coded paths kept as defaults; optionally override them:
    #   python extract_har.py [har_file] [output_dir]
    har_file = sys.argv[1] if len(sys.argv) > 1 else '/private/tmp/har/theaidigest-org-village.har'
    output_dir = sys.argv[2] if len(sys.argv) > 2 else '/private/tmp/har/extracted'
    extract_har(har_file, output_dir)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment