-
-
Save simonw/007c628ceb84d0da0795b57af7b74d7d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """Extract HAR file contents into multiple files.""" | |
| import json | |
| import os | |
| import base64 | |
| from urllib.parse import urlparse | |
| from pathlib import Path | |
# Characters unsafe in filenames (path separators and shell/URL
# metacharacters), each mapped to '_'.  Built once at import time so
# sanitize_filename does a single C-level translate() pass instead of
# eleven chained .replace() calls.
_UNSAFE_CHARS = str.maketrans({c: '_' for c in '/\\:*?"<>|&='})


def sanitize_filename(name):
    """Sanitize a string to be safe for use as a filename.

    Replaces path separators and other problematic characters with
    underscores, then truncates the result to 100 characters to stay
    under common filesystem name-length limits.

    Args:
        name: Arbitrary string (typically a URL host/path/query).

    Returns:
        A sanitized string of at most 100 characters.
    """
    return name.translate(_UNSAFE_CHARS)[:100]
# MIME type -> preferred file extension for the asset types a HAR
# capture typically contains.
_MIME_EXTENSIONS = {
    'text/html': '.html',
    'text/css': '.css',
    'text/javascript': '.js',
    'application/javascript': '.js',
    'application/json': '.json',
    'image/png': '.png',
    'image/jpeg': '.jpg',
    'image/gif': '.gif',
    'image/webp': '.webp',
    'image/svg+xml': '.svg',
    'font/woff': '.woff',
    'font/woff2': '.woff2',
    'application/font-woff': '.woff',
    'application/font-woff2': '.woff2',
}


def get_extension_from_mime(mime_type):
    """Return a file extension (with leading dot) for *mime_type*.

    The argument may carry ';'-separated parameters (e.g.
    'text/html; charset=utf-8'); only the base type is considered.
    Returns '' when the type is falsy or not recognized.
    """
    if not mime_type:
        return ''
    base, _sep, _params = mime_type.partition(';')
    return _MIME_EXTENSIONS.get(base.strip(), '')
def _build_filename_base(index, parsed_url):
    """Return a unique, filesystem-safe base name for entry *index*."""
    base = sanitize_filename(parsed_url.netloc + parsed_url.path)
    if parsed_url.query:
        base += '_' + sanitize_filename(parsed_url.query)
    if not base:
        base = 'index'
    # Prefix with the entry index so repeated requests to the same URL
    # never collide on disk.
    return f"{index:04d}_{base}"


def _save_response_body(responses_dir, filename_base, parsed_url, content):
    """Write one entry's response body under *responses_dir*.

    Returns (filename, None) on success, (None, error_message) on a
    write/decode failure, or (None, None) when the entry has no body.
    """
    text = content.get('text', '')
    if not text:
        return None, None
    ext = get_extension_from_mime(content.get('mimeType', ''))
    if not ext:
        # Fall back to the extension in the URL path, then to .txt.
        url_ext = os.path.splitext(parsed_url.path)[1]
        ext = url_ext if url_ext else '.txt'
    filename = filename_base + ext
    response_path = os.path.join(responses_dir, filename)
    try:
        if content.get('encoding', '') == 'base64':
            # HAR stores binary bodies base64-encoded.
            with open(response_path, 'wb') as f:
                f.write(base64.b64decode(text))
        else:
            with open(response_path, 'w', encoding='utf-8') as f:
                f.write(text)
    except Exception as e:
        return None, str(e)
    return filename, None


def _entry_detail(request, response, content, entry):
    """Assemble the per-entry detail record (headers, cookies, timings)."""
    return {
        'request': {
            'method': request.get('method', 'GET'),
            'url': request.get('url', ''),
            'httpVersion': request.get('httpVersion'),
            'headers': request.get('headers', []),
            'queryString': request.get('queryString', []),
            'cookies': request.get('cookies', []),
        },
        'response': {
            'status': response.get('status', 0),
            'statusText': response.get('statusText'),
            'httpVersion': response.get('httpVersion'),
            'headers': response.get('headers', []),
            'cookies': response.get('cookies', []),
            'content': {
                'size': content.get('size'),
                'mimeType': content.get('mimeType', ''),
                'compression': content.get('compression'),
            }
        },
        'timings': entry.get('timings', {}),
        'serverIPAddress': entry.get('serverIPAddress'),
        'startedDateTime': entry.get('startedDateTime'),
    }


def extract_har(har_path, output_dir):
    """Extract a HAR file into multiple files.

    Writes metadata.json, a responses/ directory holding each response
    body plus a *_detail.json per entry, and entries_index.json
    summarizing all entries.

    Args:
        har_path: Path to the .har (JSON) file to read.
        output_dir: Directory to create/populate with extracted files.

    Returns:
        The number of entries processed.
    """
    with open(har_path, 'r', encoding='utf-8') as f:
        har = json.load(f)
    log = har['log']

    # Create output directories.
    os.makedirs(output_dir, exist_ok=True)
    responses_dir = os.path.join(output_dir, 'responses')
    os.makedirs(responses_dir, exist_ok=True)

    # High-level capture metadata (creator tool, browser, pages).
    metadata = {
        'version': log.get('version'),
        'creator': log.get('creator'),
        'browser': log.get('browser'),
        'pages': log.get('pages', []),
        'entries_count': len(log.get('entries', []))
    }
    with open(os.path.join(output_dir, 'metadata.json'), 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2)
    print("Wrote metadata.json")

    entries_summary = []
    for i, entry in enumerate(log.get('entries', [])):
        request = entry.get('request', {})
        response = entry.get('response', {})
        url = request.get('url', '')
        parsed_url = urlparse(url)
        content = response.get('content', {})
        filename_base = _build_filename_base(i, parsed_url)

        entry_summary = {
            'index': i,
            'method': request.get('method', 'GET'),
            'url': url,
            'status': response.get('status', 0),
            'mimeType': content.get('mimeType', ''),
            'size': content.get('size', 0),
            'timings': entry.get('timings', {}),
        }

        # Save the response body, if any; record failures instead of
        # aborting the whole extraction.
        saved_name, error = _save_response_body(
            responses_dir, filename_base, parsed_url, content)
        if saved_name is not None:
            entry_summary['response_file'] = os.path.join('responses', saved_name)
            print(f"Extracted: {saved_name}")
        elif error is not None:
            entry_summary['error'] = error
            print(f"Error extracting {url}: {error}")

        # Save request/response headers and details alongside the body.
        detail_filename = filename_base + '_detail.json'
        with open(os.path.join(responses_dir, detail_filename), 'w', encoding='utf-8') as f:
            json.dump(_entry_detail(request, response, content, entry), f, indent=2)
        entry_summary['detail_file'] = os.path.join('responses', detail_filename)
        entries_summary.append(entry_summary)

    # Write the index of all entries.
    with open(os.path.join(output_dir, 'entries_index.json'), 'w', encoding='utf-8') as f:
        json.dump(entries_summary, f, indent=2)
    print(f"\nWrote entries_index.json with {len(entries_summary)} entries")
    print(f"\nExtraction complete! Output directory: {output_dir}")
    return len(entries_summary)
if __name__ == '__main__':
    import sys

    # Original hard-coded paths are kept as defaults; they can be
    # overridden from the command line:
    #   extract_har.py [har_file] [output_dir]
    har_file = sys.argv[1] if len(sys.argv) > 1 else '/private/tmp/har/theaidigest-org-village.har'
    output_dir = sys.argv[2] if len(sys.argv) > 2 else '/private/tmp/har/extracted'
    extract_har(har_file, output_dir)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.