Created
August 8, 2018 20:09
-
-
Save myleott/1b7ed1252c5c5fd50280953c9ab16e62 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import argparse | |
import gzip | |
import json | |
import requests | |
from io import BytesIO, StringIO | |
from urllib.parse import urlencode | |
def get_html(url, index='CC-MAIN-2018-26', timeout=60):
    """Return the archived WARC record for *url* from Common Crawl as text.

    The Common Crawl index server is queried for captures of *url*; the
    first capture listed is then downloaded with an HTTP ``Range`` request
    and gunzipped.

    Args:
        url: The page URL to look up in the index.
        index: Name of the crawl whose index is searched.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        The decompressed record for the first capture, decoded as UTF-8.
        Bytes that are not valid UTF-8 are replaced with U+FFFD rather than
        raising, because the archived payload may be in any charset.

    Raises:
        requests.HTTPError: If either HTTP request returns an error status.
        LookupError: If the index response contains no capture records.
    """
    # Escape the target URL so it survives as a single query parameter.
    query = urlencode({'url': url})
    resp = requests.get(
        'http://index.commoncrawl.org/{}-index?output=json&{}'.format(index, query),
        timeout=timeout,
    )
    resp.raise_for_status()

    # The index answers with one JSON object per line; skip blank lines so
    # an empty body does not blow up inside json.loads.
    pages = [
        json.loads(line)
        for line in resp.content.decode('utf-8').splitlines()
        if line.strip()
    ]
    if not pages:
        raise LookupError(
            'no captures of {!r} found in index {!r}'.format(url, index))

    # Multiple pages may have been found - we're only interested in one
    page = pages[0]

    # We need to calculate the start and the end of the relevant byte range
    # (each WARC file is composed of many small GZIP files stuck together)
    offset, length = int(page['offset']), int(page['length'])
    offset_end = offset + length - 1  # HTTP byte ranges are inclusive

    # We'll get the file via HTTPS so we don't need to worry about S3 credentials
    # NOTE(review): Common Crawl now documents https://data.commoncrawl.org/
    # as its HTTP download prefix - confirm this host still serves requests.
    prefix = 'https://commoncrawl.s3.amazonaws.com/'

    # We can then use the Range header to ask for just this set of bytes
    resp = requests.get(
        prefix + page['filename'],
        headers={'Range': 'bytes={}-{}'.format(offset, offset_end)},
        timeout=timeout,
    )
    resp.raise_for_status()

    # The page is stored compressed (gzip) to save space; the context
    # manager makes sure the GzipFile is closed once it has been read.
    with gzip.GzipFile(fileobj=BytesIO(resp.content)) as f:
        record = f.read()

    # What we have now is just the WARC response
    return record.decode('utf-8', errors='replace')
def main():
    """Command-line entry point.

    Reads a single positional ``url`` argument, looks it up with
    ``get_html`` and prints the returned text to standard output.
    """
    cli = argparse.ArgumentParser(description='')
    cli.add_argument('url')
    target = cli.parse_args().url
    print(get_html(target))
# Run the command-line interface only when executed as a script,
# not when this module is imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Based heavily on: https://gist.github.com/Smerity/56bc6f21a8adec920ebf