@j-bennet
Forked from msukmanowsky/get_urls.txt
Download a list of Common Crawl URLs and write them to the file "urls.txt"
#!/usr/bin/env python
'''Build up a set of URLs using the Common Crawl index. See
http://commoncrawl.org/2015/04/announcing-the-common-crawl-index/ for more info.
'''
from __future__ import print_function

import gzip
import logging
import os
import random

import boto3

log = logging.getLogger('urlutils.profiling.get_urls')

_here = lambda *paths: os.path.join(os.path.dirname(os.path.abspath(__file__)), *paths)


def get_common_crawl_urls(week='2016-07', max_urls=10000000):
    num_urls = 0
    bucket = boto3.resource('s3').Bucket('commoncrawl')
    objects = bucket.objects.filter(
        Prefix='cc-index/collections/CC-MAIN-{}/indexes/'.format(week))
    objects = [o for o in objects if o.key.endswith('.gz')]
    # Common Crawl index entries are sorted alphabetically by reversed domain,
    # so sample a few shards at random rather than reading from the start and
    # grabbing only stuff like http://69.30.227.140/showthread.php?tid=35992
    objects = random.sample(objects, 3)
    for object_ in objects:
        filename = _here(os.path.basename(object_.key))
        if not os.path.exists(filename):
            log.info('Downloading common crawl index file %s to %s', object_.key, filename)
            bucket.download_file(object_.key, filename)
            log.info('Downloaded %s to %s', object_.key, filename)
        with gzip.open(filename) as fp:
            for line in fp:
                if num_urls == max_urls:
                    break
                # Each index line looks like:
                #   com,example)/path 20160207225905 {"url": "http://example.com/path", ...}
                # so the fourth space-separated token is the quoted URL; strip
                # the leading quote and the trailing quote-plus-comma.
                yield line.split(' ')[3][1:-2]
                num_urls += 1
        os.unlink(filename)
        if num_urls == max_urls:
            break


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    for name in ('boto3', 'botocore'):
        logging.getLogger(name).setLevel(logging.WARN)

    filename = _here('urls.txt')
    max_urls = 10000000
    log.info('Writing {:,} URLs to {}'.format(max_urls, filename))
    with open(filename, 'w') as fp:
        for url in get_common_crawl_urls(max_urls=max_urls):
            print(url, file=fp)
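
For a quick smoke test, the generator can also be used on its own. A minimal sketch, assuming the gist is saved as get_urls.py in the working directory and boto3 can find AWS credentials (the __main__ guard above keeps the import side-effect-free); note that each index shard is a large download, hundreds of megabytes, so even a small max_urls fetches one full shard first:

from get_urls import get_common_crawl_urls

# Pull a handful of URLs from a randomly sampled index shard.
for url in get_common_crawl_urls(week='2016-07', max_urls=5):
    print(url)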