Skip to content

Instantly share code, notes, and snippets.

@ericharley
Created November 9, 2018 20:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ericharley/67055555821568308c1bbaa75430cf4f to your computer and use it in GitHub Desktop.
Save ericharley/67055555821568308c1bbaa75430cf4f to your computer and use it in GitHub Desktop.
Python for Common Crawl
import csv
import gzip
import io
from StringIO import StringIO

import requests
# Parameters
# Base URL that each WARC file path is appended to. This is Common Crawl's
# documented HTTPS download endpoint; the legacy commoncrawl.s3.amazonaws.com
# URL no longer serves anonymous HTTP requests.
prefix = 'https://data.commoncrawl.org/'
# Extension given to every payload written to disk.
fileout_extension = "pdf"
def get_file(warc_filename, warc_record_offset, warc_record_length,
             content_digest, timeout=60):
    """Fetch one WARC record by byte range and return its HTTP payload.

    Args:
        warc_filename: Path of the WARC file; appended to the module-level
            ``prefix`` to build the download URL.
        warc_record_offset: Byte offset of the record inside the WARC file
            (an int or a numeric string).
        warc_record_length: Length in bytes of the compressed record
            (an int or a numeric string).
        content_digest: Unused here; kept so existing callers keep working.
        timeout: Seconds passed to ``requests.get`` as its timeout so a
            stalled connection cannot hang the caller forever.

    Returns:
        The bytes that follow the WARC header and the HTTP header of the
        record, with only the record's trailing CRLF CRLF terminator removed.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the offset/length are not integers, or if the
            decompressed record does not contain a WARC header, an HTTP
            header and a body separated by blank lines.
    """
    # compute request parameters
    url = prefix + warc_filename

    # Each WARC file is composed of many gzip files concatenated together,
    # so a single record can be fetched and decompressed on its own.
    # Calculate the start and the end of the relevant byte range.
    offset = int(warc_record_offset)
    length = int(warc_record_length)
    offset_end = offset + length - 1  # HTTP byte ranges are inclusive

    # Use the Range HTTP header to specify the set of bytes
    resp = requests.get(
        url,
        headers={'Range': 'bytes={}-{}'.format(offset, offset_end)},
        timeout=timeout,
    )
    # Fail loudly on 403/404/5xx instead of trying to gunzip an error page.
    resp.raise_for_status()

    # decompress the returned WARC response; the content is binary, so it is
    # wrapped in a bytes buffer rather than a text one.
    record = gzip.GzipFile(fileobj=io.BytesIO(resp.content)).read()

    # Layout: WARC header, blank line, HTTP header, blank line, body.
    parts = record.split(b'\r\n\r\n', 2)
    if len(parts) != 3:
        raise ValueError(
            'WARC record in {} at offset {} is missing a header or body '
            'section'.format(warc_filename, offset)
        )
    body = parts[2]

    # A WARC record ends with exactly two CRLFs. Remove only that terminator:
    # a blanket strip() would also eat whitespace bytes that belong to the
    # (binary) payload itself.
    if body.endswith(b'\r\n\r\n'):
        body = body[:-4]
    return body
# Download every record listed in the CSV index and save each payload to disk.
# The CSV is read through its header row and must provide the columns
# warc_filename, warc_record_offset, warc_record_length and content_digest.
with open('files_cc_1k.csv', 'r') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        response = get_file(row['warc_filename'], row['warc_record_offset'],
                            row['warc_record_length'], row['content_digest'])
        # Write out file to disk, named after the record's content digest.
        output_filename = row['content_digest'] + "." + fileout_extension
        # The context manager closes the file even if the write fails.
        with open(output_filename, "wb") as output_file:
            output_file.write(response)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment