@j-bennet
Forked from msukmanowsky/get_urls.txt
Download a list of Common Crawl URLs and write them to the file "urls.txt"
#!/usr/bin/env python
'''Build up a set of URLs using the Common Crawl index. See
http://commoncrawl.org/2015/04/announcing-the-common-crawl-index/ for more info.
'''
from __future__ import print_function

import gzip
import logging
import os
import random

import boto3

log = logging.getLogger('urlutils.profiling.get_urls')

_here = lambda *paths: os.path.join(os.path.dirname(os.path.abspath(__file__)), *paths)


def get_common_crawl_urls(week='2016-07', max_urls=10000000):
    num_urls = 0
    bucket = boto3.resource('s3').Bucket('commoncrawl')
    objects = bucket.objects.filter(
        Prefix='cc-index/collections/CC-MAIN-{}/indexes/'.format(week))
    objects = [o for o in objects if o.key.endswith('.gz')]
    # Common Crawl index entries are sorted alphabetically by reversed domain,
    # so sample a few shards at random rather than reading from the start and
    # grabbing only stuff like http://69.30.227.140/showthread.php?tid=35992
    objects = random.sample(objects, 3)
    for object_ in objects:
        filename = _here(os.path.basename(object_.key))
        if not os.path.exists(filename):
            log.info('Downloading common crawl index file %s to %s', object_.key, filename)
            bucket.download_file(object_.key, filename)
            log.info('Downloaded %s to %s', object_.key, filename)
        with gzip.open(filename) as fp:
            for line in fp:
                if num_urls == max_urls:
                    break
                # Each index line looks like:
                #   com,example)/path 20160207225905 {"url": "http://example.com/path", ...}
                # so the fourth space-separated token is the quoted URL; strip
                # the leading quote and the trailing quote-plus-comma.
                yield line.split(' ')[3][1:-2]
                num_urls += 1
        os.unlink(filename)
        if num_urls == max_urls:
            break


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    for name in ('boto3', 'botocore'):
        logging.getLogger(name).setLevel(logging.WARN)

    filename = _here('urls.txt')
    max_urls = 10000000
    log.info('Writing {:,} URLs to {}'.format(max_urls, filename))
    with open(filename, 'w') as fp:
        for url in get_common_crawl_urls(max_urls=max_urls):
            print(url, file=fp)
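
For a quick smoke test, the generator can also be used on its own. A minimal sketch, assuming the gist is saved as get_urls.py in the working directory and boto3 can find AWS credentials (the __main__ guard above keeps the import side-effect-free); note that each index shard is a large download, hundreds of megabytes, so even a small max_urls fetches one full shard first:

from get_urls import get_common_crawl_urls

# Pull a handful of URLs from a randomly sampled index shard.
for url in get_common_crawl_urls(week='2016-07', max_urls=5):
    print(url)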