Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Python script to export Common Crawl WARC records found via CDX to a file named get.warc.gz (progress messages go to stderr): `zgrep '...pattern...' cdx-*.gz | python3 cdx_get_warc_record.py`
import fileinput
import sys
import boto3
import botocore
import ujson as json
# Name of the S3 bucket the records are fetched from.
bucketname = 'commoncrawl'

# Client configuration with request signing switched off.
no_sign_request = botocore.client.Config(signature_version=botocore.UNSIGNED)
s3client = boto3.client('s3', config=no_sign_request)

# Binary output file that every fetched record is appended to.
warc_output = open('get.warc.gz', 'w+b')
def get_record(line):
    """Fetch the record described by one index line and append it to the output.

    ``line`` must consist of two space-separated fields followed by a JSON
    object (presumably the URL key and timestamp of a CDX line -- only the
    JSON part is used here).  The object has to provide ``filename``,
    ``offset`` and ``length``; these select the byte range that is requested
    from the bucket and written to ``warc_output``.

    Lines without a parseable JSON payload are skipped silently.  Lines whose
    metadata is incomplete or malformed, and downloads rejected by S3, are
    reported on stderr and skipped so that one bad entry does not abort the
    whole run.  Always returns ``None``.
    """
    # Split at most twice so the JSON payload keeps any spaces it contains.
    fields = line.split(' ', 2)
    if len(fields) < 3:
        return
    try:
        metadata = json.loads(fields[2])
    except ValueError:
        # Not a JSON payload (JSON decode errors derive from ValueError).
        return

    try:
        path = metadata['filename']
        offset = int(metadata['offset'])
        length = int(metadata['length'])
    except (KeyError, TypeError, ValueError) as exception:
        sys.stderr.write('Skipping record with invalid metadata: {!r}\n'.format(
            exception))
        return

    # HTTP byte ranges are inclusive at both ends, hence the "- 1".
    rangereq = 'bytes={}-{}'.format(offset, offset + length - 1)
    sys.stderr.write('Get {} from {}\n'.format(rangereq, path))
    try:
        response = s3client.get_object(Bucket=bucketname,
                                       Key=path,
                                       Range=rangereq)
        warc_output.write(response["Body"].read())
    except botocore.client.ClientError as exception:
        sys.stderr.write('Failed to download {}: {}\n'.format(
            path, exception))
# Process every index line from the files named on the command line, or from
# stdin when none are given.
try:
    for line in fileinput.input():
        get_record(line.rstrip('\n'))
finally:
    # Make sure buffered data reaches disk and the handle is released even
    # when the run is interrupted or a record raises unexpectedly.
    warc_output.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment