Aug 19, 2016

Count the number of responses per origin in Chrome's HTTP cache.
#!/usr/bin/env python
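"""
Returns the number of responses for each origin in Chrome's HTTP
cache (fresh or stale).

Run with STDIN from the results of saving Chrome's cache listing page
(as HTML, *not* a Web Archive). The page address that originally followed
this sentence is missing from this copy; presumably it was chrome://cache/.
"""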
from collections import defaultdict
from HTMLParser import HTMLParser
from htmlentitydefs import name2codepoint
from urlparse import urlsplit
# Default port for each URL scheme, used when a URL names no explicit port.
port_lookup = {
    'https': 443,
    'http': 80,
}


class MyHTMLParser(HTMLParser):
    """Count, per origin, the links whose text is a URL.

    The text inside each ``<a>`` element is treated as a URL and reduced to
    an origin string of the form ``"(scheme hostname port)"``.

    Attributes:
        in_a: True while the parser is inside an ``<a>`` element.
        content: Text collected so far for the current ``<a>`` element.
        origins: ``defaultdict(int)`` mapping origin string to link count.
    """

    def __init__(self):
        # Explicit base-class call: HTMLParser is an old-style class on
        # Python 2, so super() is not available.
        HTMLParser.__init__(self)
        self.in_a = False
        self.content = ''
        self.origins = defaultdict(int)

    def handle_starttag(self, tag, attrs):
        """Start collecting link text when an ``<a>`` element opens."""
        if tag == "a":
            self.in_a = True
            self.content = ''

    def handle_endtag(self, tag):
        """Tally the origin of the link that just closed.

        Only ``</a>`` that closes an open ``<a>`` is counted; any other end
        tag would otherwise re-count the previous link's text.
        """
        if tag != "a" or not self.in_a:
            return
        self.in_a = False
        url = urlsplit(self.content)
        try:
            port = url.port
        except ValueError:
            # The URL carries a malformed port; fall back to the default.
            port = None
        if not port:
            port = port_lookup.get(url.scheme, None)
        origin = "(%s %s %s)" % (url.scheme, url.hostname, port)
        self.origins[origin] += 1

    def handle_data(self, data):
        """Append text to the current link's content."""
        if self.in_a:
            self.content += data

    def handle_entityref(self, name):
        """Append the character for a named entity such as ``&amp;``.

        Unknown names are kept literally as ``&name`` instead of raising
        KeyError; a bare ``&`` in a query string can look like an entity.
        """
        if not self.in_a:
            return
        try:
            self.content += unichr(name2codepoint[name])
        except KeyError:
            self.content += "&" + name


if __name__ == "__main__":
    import sys
    parser = MyHTMLParser()
    for line in sys.stdin:
        parser.feed(line)
    # Flush anything the parser is still buffering.
    parser.close()
    for origin, count in sorted(parser.origins.items()):
        # Single-argument parenthesised form prints the same on Python 2 and 3.
        print("%s %s" % (origin, count))

dcarley commented Aug 19, 2016

Running the script gave an exception and traceback:

Traceback (most recent call last):
  File "", line 53, in <module>
  File "/opt/boxen/homebrew/Cellar/python/2.7.9/Frameworks/Python.framework/Versions/2.7/lib/python2.7/HTMLParser.py", line 117, in feed
  File "/opt/boxen/homebrew/Cellar/python/2.7.9/Frameworks/Python.framework/Versions/2.7/lib/python2.7/HTMLParser.py", line 163, in goahead
    k = self.parse_endtag(i)
  File "/opt/boxen/homebrew/Cellar/python/2.7.9/Frameworks/Python.framework/Versions/2.7/lib/python2.7/HTMLParser.py", line 401, in parse_endtag
  File "", line 37, in handle_endtag
    origin = "(%s %s %s)" % (url.scheme, url.hostname, port)
UnboundLocalError: local variable 'port' referenced before assignment

Should it be using (and reassigning) `url.port` instead of `port`?

