@davidwu111
Created May 28, 2023 03:45
A quick Python script to extract all URLs, hostnames, and registered domains (second-level domain plus public suffix) from a HAR file. Requires the third-party tldextract package (installable with pip install tldextract); everything else is in the standard library.
import json
from urllib.parse import urlparse

import tldextract


def extract_urls(har_file_path, url_file_path, domain_txt_file_path, suffix_txt_file_path):
    # A HAR file is a JSON document; the captured requests live under log.entries.
    with open(har_file_path, 'r', encoding='utf-8') as f:
        har_data = json.load(f)

    urls = []
    domains = set()
    second_domains = set()

    for entry in har_data['log']['entries']:
        url = entry['request']['url']
        urls.append(url)

        # Full hostname of the request, e.g. "cdn.example.co.uk".
        domain = urlparse(url).netloc
        domains.add(domain)

        # Registered domain plus public suffix, e.g. "example.co.uk".
        # Attribute access works across tldextract versions.
        second_domain_extract = tldextract.extract(url)
        second_domain = '.'.join(
            part for part in (second_domain_extract.domain, second_domain_extract.suffix) if part
        )
        second_domains.add(second_domain)

    # Write one value per line to each output file.
    with open(url_file_path, 'w', encoding='utf-8') as f:
        for url in urls:
            f.write(url + '\n')

    with open(domain_txt_file_path, 'w', encoding='utf-8') as f:
        for domain in domains:
            f.write(domain + '\n')

    with open(suffix_txt_file_path, 'w', encoding='utf-8') as f:
        for second_domain in second_domains:
            f.write(second_domain + '\n')


# Replace path/to/your/har/file.har with the path to your HAR file.
extract_urls("path/to/your/har/file.har", "urls.txt", "domains.txt", "domain-suffixes.txt")
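As a quick illustration of the tldextract split the script relies on, here is a minimal sketch; the URL is a made-up example, and the expected values are shown as comments:

import tldextract

# Hypothetical URL, used only to show how the hostname is split.
ext = tldextract.extract("https://cdn.example.co.uk/asset.js")
print(ext.subdomain)  # cdn
print(ext.domain)     # example
print(ext.suffix)     # co.uk

Joining domain and suffix, as extract_urls does, collapses cdn.example.co.uk and www.example.co.uk to the same registered domain, example.co.uk, which is what ends up in domain-suffixes.txt. Note that by default tldextract fetches the public suffix list on first use and caches it locally.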