myleott/extract_common_crawl.py

## extract_common_crawl.py
#!/usr/bin/env python

import argparse
import gzip
import json
import requests
from io import BytesIO, StringIO
from urllib.parse import urlencode


def get_html(url, index='CC-MAIN-2018-26'):
    """Fetch the archived WARC record for ``url`` from Common Crawl.

    Looks the URL up on the Common Crawl index server, takes the first
    capture listed, downloads only that record's byte range from the crawl
    archive over HTTPS and gunzips it.

    Args:
        url: URL to look up in the crawl index.
        index: Crawl identifier used to build the index endpoint name.

    Returns:
        The decompressed WARC record as text. Bytes that are not valid
        UTF-8 are replaced with U+FFFD instead of raising, because the
        archived payload may be in any encoding.

    Raises:
        requests.HTTPError: If the index lookup or the archive download
            answers with a 4xx/5xx status.
        ValueError: If the index response contains no capture records.
    """
    query = urlencode({'url': url})
    resp = requests.get('http://index.commoncrawl.org/{}-index?output=json&{}'.format(index, query))
    # Fail here rather than trying to parse an error page as capture records
    resp.raise_for_status()

    # The index answers with one JSON object per line; ignore blank lines
    lines = resp.content.decode('utf-8').strip().split('\n')
    pages = [json.loads(line) for line in lines if line.strip()]
    if not pages:
        raise ValueError('no captures found for {!r} in index {}'.format(url, index))

    # Multiple pages may have been found - we're only interested in one
    page = pages[0]

    # We need to calculate the start and the end of the relevant byte range
    # (each WARC file is composed of many small GZIP files stuck together).
    # HTTP byte ranges are inclusive on both ends, hence the -1.
    offset, length = int(page['offset']), int(page['length'])
    offset_end = offset + length - 1

    # We'll get the file via HTTPS so we don't need to worry about S3 credentials
    # Getting the file on S3 is equivalent however - you can request a Range
    prefix = 'https://commoncrawl.s3.amazonaws.com/'
    # We can then use the Range header to ask for just this set of bytes
    resp = requests.get(prefix + page['filename'], headers={'Range': 'bytes={}-{}'.format(offset, offset_end)})
    # Don't try to gunzip an error document
    resp.raise_for_status()

    # The record is stored compressed (gzip) to save space
    with gzip.GzipFile(fileobj=BytesIO(resp.content)) as f:
        record = f.read()

    # What we have now is just the WARC response; decode leniently since the
    # captured page is not guaranteed to be UTF-8
    return record.decode('utf-8', errors='replace')


def main():
    """Command-line entry point: print the archived record for a URL.

    Parses the positional ``url`` argument and the optional ``--index``
    crawl identifier, fetches the record with ``get_html`` and prints it
    to standard output.
    """
    parser = argparse.ArgumentParser(
        description='Print the Common Crawl WARC record captured for a URL.')
    parser.add_argument('url', help='URL to look up in the Common Crawl index')
    parser.add_argument(
        '--index', default=None,
        help='crawl index to query, e.g. CC-MAIN-2018-26 '
             '(defaults to the one built into get_html)')
    args = parser.parse_args()

    # Only pass the index when it was given, so get_html keeps owning the default
    extra = {} if args.index is None else {'index': args.index}
    print(get_html(args.url, **extra))


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
	#!/usr/bin/env python

	import argparse
	import gzip
	import json
	import requests
	from io import BytesIO, StringIO
	from urllib.parse import urlencode


	def get_html(url, index='CC-MAIN-2018-26'):
	    """Fetch the archived WARC record for ``url`` from Common Crawl.

	    Looks the URL up on the Common Crawl index server, takes the first
	    capture listed, downloads only that record's byte range from the crawl
	    archive over HTTPS and gunzips it.

	    Args:
	        url: URL to look up in the crawl index.
	        index: Crawl identifier used to build the index endpoint name.

	    Returns:
	        The decompressed WARC record as text. Bytes that are not valid
	        UTF-8 are replaced with U+FFFD instead of raising, because the
	        archived payload may be in any encoding.

	    Raises:
	        requests.HTTPError: If the index lookup or the archive download
	            answers with a 4xx/5xx status.
	        ValueError: If the index response contains no capture records.
	    """
	    query = urlencode({'url': url})
	    resp = requests.get('http://index.commoncrawl.org/{}-index?output=json&{}'.format(index, query))
	    # Fail here rather than trying to parse an error page as capture records
	    resp.raise_for_status()

	    # The index answers with one JSON object per line; ignore blank lines
	    lines = resp.content.decode('utf-8').strip().split('\n')
	    pages = [json.loads(line) for line in lines if line.strip()]
	    if not pages:
	        raise ValueError('no captures found for {!r} in index {}'.format(url, index))

	    # Multiple pages may have been found - we're only interested in one
	    page = pages[0]

	    # We need to calculate the start and the end of the relevant byte range
	    # (each WARC file is composed of many small GZIP files stuck together).
	    # HTTP byte ranges are inclusive on both ends, hence the -1.
	    offset, length = int(page['offset']), int(page['length'])
	    offset_end = offset + length - 1

	    # We'll get the file via HTTPS so we don't need to worry about S3 credentials
	    # Getting the file on S3 is equivalent however - you can request a Range
	    prefix = 'https://commoncrawl.s3.amazonaws.com/'
	    # We can then use the Range header to ask for just this set of bytes
	    resp = requests.get(prefix + page['filename'], headers={'Range': 'bytes={}-{}'.format(offset, offset_end)})
	    # Don't try to gunzip an error document
	    resp.raise_for_status()

	    # The record is stored compressed (gzip) to save space
	    with gzip.GzipFile(fileobj=BytesIO(resp.content)) as f:
	        record = f.read()

	    # What we have now is just the WARC response; decode leniently since the
	    # captured page is not guaranteed to be UTF-8
	    return record.decode('utf-8', errors='replace')


	def main():
	    """Command-line entry point: print the archived record for a URL.

	    Parses the positional ``url`` argument and the optional ``--index``
	    crawl identifier, fetches the record with ``get_html`` and prints it
	    to standard output.
	    """
	    parser = argparse.ArgumentParser(
	        description='Print the Common Crawl WARC record captured for a URL.')
	    parser.add_argument('url', help='URL to look up in the Common Crawl index')
	    parser.add_argument(
	        '--index', default=None,
	        help='crawl index to query, e.g. CC-MAIN-2018-26 '
	             '(defaults to the one built into get_html)')
	    args = parser.parse_args()

	    # Only pass the index when it was given, so get_html keeps owning the default
	    extra = {} if args.index is None else {'index': args.index}
	    print(get_html(args.url, **extra))


	# Run the command-line interface only when this file is executed as a
	# script, not when it is imported as a module.
	if __name__ == '__main__':
	    main()