Skip to content

Instantly share code, notes, and snippets.

@johngian
Created October 20, 2015 14:38
Show Gist options
  • Save johngian/c3ddb784a35e7bf377c2 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import click
import os
import requests
import tempfile
from multiprocessing import Pool

# BUGFIX: itertools.izip was removed in Python 3 (the builtin zip is lazy
# there); fall back so the script runs on both interpreters.
try:
    from itertools import izip
except ImportError:  # Python 3
    izip = zip
from itertools import repeat
def get_data(url, api_key, data_dir):
    """Fetch one page of the users API and save each listed user's detail
    record to a file in *data_dir*.

    Parameters:
        url: URL of one API list page to fetch.
        api_key: value sent in the ``X-API-KEY`` request header.
        data_dir: existing directory where one file per user is written,
            named after the trailing path segment of the user's ``_url``.

    Non-200 responses (page or per-user) are silently skipped — this is a
    deliberate best-effort scraper, so no exception is raised for them.
    """
    try:
        headers = {
            'X-API-KEY': api_key
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            data = response.json()
            for entry in data['results']:
                user_url = entry['_url']
                user_response = requests.get(user_url, headers=headers)
                if user_response.status_code == 200:
                    # e.g. '.../users/<id>/' -> '<id>' (second-to-last segment)
                    filename = user_url.rsplit('/', 2)[-2]
                    filepath = os.path.join(data_dir, filename)
                    # BUGFIX: response.content is bytes — open in binary mode
                    # so this works on Python 3 and avoids newline mangling.
                    with open(filepath, 'wb') as f:
                        f.write(user_response.content)
    except KeyboardInterrupt:
        # Convert Ctrl-C in a pool worker into an ordinary Exception so the
        # parent's Pool.map sees a normal worker failure instead of the
        # interrupt being swallowed inside multiprocessing.
        raise Exception()
def get_data_star(args):
    """Adapter for Pool.map, which passes a single argument: unpack the
    (url, api_key, data_dir) tuple and delegate to get_data."""
    url, api_key, data_dir = args
    return get_data(url, api_key, data_dir)
@click.command()
@click.option('--pool_size', default=10, help='Number of processes.')
@click.option('--api_key', required=True, help='Mozillians.org API key.')
@click.option('--api_url', default='https://mozillians.org/api/v2/users/',
              help='Mozillians.org API endpoint.')
@click.option('--data_dir', default=None,
              help='Output directory.')
def scrap(pool_size, api_key, api_url, data_dir):
    """Scrape all user records from the Mozillians API using a process pool.

    Spawns *pool_size* worker processes, each fetching list pages and
    writing one file per user into *data_dir*. Ctrl-C terminates the pool
    cleanly; any other worker failure also terminates it, and the pool is
    always joined before exit.
    """
    # BUGFIX: the original passed default=tempfile.mkdtemp(...) to the
    # option, which ran at import time and created a temp dir even when
    # --data_dir was supplied. Defer creation until the command runs.
    if data_dir is None:
        data_dir = tempfile.mkdtemp(prefix='mozillians')
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    pool = Pool(pool_size)
    base_url = '{0}?page='.format(api_url)
    # NOTE(review): pages 1..1241 are hard-coded — presumably the API's page
    # count at scrape time; confirm before reuse.
    urls = ['{0}{1}'.format(base_url, page) for page in range(1, 1242)]
    arguments = izip(urls, repeat(api_key), repeat(data_dir))
    click.echo('POOL_SIZE: {}'.format(pool_size))
    click.echo('DATA_DIR: {}'.format(data_dir))
    try:
        click.echo('Launching GET processes...')
        pool.map(get_data_star, arguments)
        pool.close()
        click.echo('Scraping complete!')
    except KeyboardInterrupt:
        click.echo('Terminating process pool. ')
        pool.terminate()
        click.echo('Pool terminated.')
    # BUGFIX: 'except Exception, e' is Python-2-only syntax (a SyntaxError
    # on Python 3); 'as e' works on Python 2.6+ and 3.
    except Exception as e:
        click.echo('Got exception: %r, terminating the pool' % (e,))
        pool.terminate()
        click.echo('Pool is terminated')
    finally:
        click.echo('Joining pool processes.')
        pool.join()
        click.echo('Join complete!')
        click.echo('Bye bye!')


if __name__ == '__main__':
    scrap()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment