Parse a HAR (HTTP Archive) and return URLs which do not match a specific domain
#!/usr/bin/env python
"""
Parse a HAR (HTTP Archive) and return URLs which do not match a specific domain
HAR Spec: http://groups.google.com/group/http-archive-specification/web/har-1-2-spec
A HAR can be saved, for example, from Chrome Developer Tools
Copyleft 2012 Andrei Kost <kost@kost.ru>
based on Ian Gallagher's <crash@neg9.org> script https://gist.github.com/892479
Example usage: ./har_request_urls.py foo.har
"""
import json, urlparse

except_domain = 'example.com'  # domain to exclude

if '__main__' == __name__:
    import sys
    har_file = sys.argv[1]

    # Read HAR archive (skip over the UTF-8 BOM if present - Fiddler2 exports contain this)
    har_data = open(har_file, 'rb').read()
    skip = 3 if '\xef\xbb\xbf' == har_data[:3] else 0
    har = json.loads(har_data[skip:])

    # Keep only entries whose request hostname is not the excluded domain
    matching_entries = filter(lambda x: except_domain != urlparse.urlparse(x['request']['url']).hostname, har['log']['entries'])

    # Deduplicate the URLs and print them in sorted order
    matching_urls = set(map(lambda x: x['request']['url'], matching_entries))
    for url in sorted(matching_urls):
        print url
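
The script above is Python 2 (urlparse module, print statement). A minimal Python 3 sketch of the same filtering logic, using only the standard library, could look like the following; the utf-8-sig codec is used on the assumption that any leading BOM is the UTF-8 one written by Fiddler exports:

#!/usr/bin/env python3
"""Python 3 sketch: print HAR request URLs whose hostname is not the excluded domain."""
import json
import sys
from urllib.parse import urlparse

except_domain = 'example.com'  # domain to exclude

if __name__ == '__main__':
    har_file = sys.argv[1]
    # utf-8-sig transparently strips a UTF-8 BOM if one is present
    with open(har_file, encoding='utf-8-sig') as f:
        har = json.load(f)
    # Collect unique request URLs whose hostname differs from the excluded domain
    urls = {entry['request']['url'] for entry in har['log']['entries']
            if urlparse(entry['request']['url']).hostname != except_domain}
    for url in sorted(urls):
        print(url)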