Skip to content

Instantly share code, notes, and snippets.

@edsu
Created April 14, 2016 19:07
Show Gist options
  • Save edsu/6b3ec82cb9e155b4521efbdfdd436da1 to your computer and use it in GitHub Desktop.
Save edsu/6b3ec82cb9e155b4521efbdfdd436da1 to your computer and use it in GitHub Desktop.
import warc
from StringIO import StringIO
from httplib import HTTPResponse
class FakeSocket(object):
    """Socket stand-in that serves an already-captured response from memory.

    An instance is passed to ``HTTPResponse`` in place of a real socket, so
    the response text can be parsed without any network connection. Only
    ``makefile()`` is provided.
    """

    def __init__(self, response_str):
        """Wrap *response_str* (the raw response text) in a ``StringIO`` buffer."""
        self._file = StringIO(response_str)

    def makefile(self, *args, **kwargs):
        """Return the in-memory buffer.

        Any mode/buffering arguments are accepted for call compatibility
        and ignored; every call returns the same buffer object.
        """
        return self._file
# Print the target URI of every "response" record in the archive whose
# HTTP Content-Type is HTML.
for record in warc.open("eada.warc.gz"):
    if record.type != "response":
        continue

    # Parse the captured payload as an HTTP response; FakeSocket lets
    # HTTPResponse read it from memory instead of a network socket.
    resp = HTTPResponse(FakeSocket(record.payload.read()))
    resp.begin()

    # Compare only the media type: the header value may carry parameters
    # (e.g. "text/html; charset=utf-8") and is case-insensitive, so an
    # exact string match would miss most real HTML responses. A missing
    # header falls back to "" and is treated as non-HTML.
    content_type = resp.getheader("content-type", "")
    media_type = content_type.split(";", 1)[0].strip().lower()
    if media_type == "text/html":
        print(record['WARC-Target-URI'])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment