Skip to content

Instantly share code, notes, and snippets.

@myleott
Created August 8, 2018 20:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save myleott/1b7ed1252c5c5fd50280953c9ab16e62 to your computer and use it in GitHub Desktop.
Save myleott/1b7ed1252c5c5fd50280953c9ab16e62 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import argparse
import gzip
import json
import requests
from io import BytesIO, StringIO
from urllib.parse import urlencode
def get_html(url, index='CC-MAIN-2018-26', timeout=60):
    """Fetch one archived capture of ``url`` from Common Crawl.

    The URL is looked up in the Common Crawl index for the given crawl, and
    only the byte range of the first matching record is downloaded from the
    WARC file and decompressed.

    Args:
        url: Address of the page to look up.
        index: Crawl identifier used to pick the index endpoint.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        The decompressed WARC response record as text. Bytes that are not
        valid UTF-8 are replaced with U+FFFD instead of raising.

    Raises:
        requests.HTTPError: If the index lookup or the WARC range request
            returns an error status.
        ValueError: If the index response contains no records for ``url``.
    """
    # Let requests build the query string so the URL is escaped consistently.
    resp = requests.get(
        'http://index.commoncrawl.org/{}-index'.format(index),
        params={'output': 'json', 'url': url},
        timeout=timeout,
    )
    # Fail here with the real HTTP error rather than later with a KeyError
    # on an error payload that has no 'offset' field.
    resp.raise_for_status()

    # The index answers with one JSON document per line; skip blank lines so
    # an empty response is reported clearly instead of crashing json.loads.
    lines = resp.content.decode('utf-8').split('\n')
    records = [json.loads(line) for line in lines if line.strip()]
    if not records:
        raise ValueError(
            'no Common Crawl captures found for {!r} in {}'.format(url, index)
        )

    # Multiple captures may have been found - we're only interested in one.
    record = records[0]

    # Each WARC file is composed of many small GZIP members stuck together,
    # so the record is addressed by an inclusive byte range.
    start = int(record['offset'])
    end = start + int(record['length']) - 1

    # Fetch over HTTPS so no S3 credentials are needed.
    # NOTE(review): Common Crawl appears to serve data from
    # https://data.commoncrawl.org/ nowadays - confirm that this S3 hostname
    # still allows anonymous access before relying on it.
    prefix = 'https://commoncrawl.s3.amazonaws.com/'
    resp = requests.get(
        prefix + record['filename'],
        headers={'Range': 'bytes={}-{}'.format(start, end)},
        timeout=timeout,
    )
    resp.raise_for_status()

    # The record is stored gzip-compressed; close the reader when done.
    with gzip.GzipFile(fileobj=BytesIO(resp.content)) as f:
        # Archived pages are not guaranteed to be UTF-8, so substitute
        # undecodable bytes rather than failing on the whole record.
        return f.read().decode('utf-8', errors='replace')
def main():
    """Command-line entry point: print the archived record for one URL.

    Reads the URL (and optionally a crawl index) from the command line,
    fetches the record with ``get_html`` and writes it to standard output.
    """
    parser = argparse.ArgumentParser(
        description='Print the Common Crawl capture of a URL.',
    )
    parser.add_argument('url', help='address of the page to look up')
    parser.add_argument(
        '--index',
        default=None,
        help='crawl index to search, e.g. CC-MAIN-2018-26 '
             '(default: the one built into get_html)',
    )
    args = parser.parse_args()

    # Forward --index only when it was given, so get_html stays the single
    # owner of the default crawl.
    extra = {} if args.index is None else {'index': args.index}
    print(get_html(args.url, **extra))
# Run the CLI only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
@myleott
Copy link
Author

myleott commented Aug 8, 2018

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment