Skip to content

Instantly share code, notes, and snippets.

@nmoinvaz
Forked from ralphbean/flickr-scraper.py
Last active August 7, 2021 11:30
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nmoinvaz/92e1be6c2ba25de009f7593dcf0229cd to your computer and use it in GitHub Desktop.
Save nmoinvaz/92e1be6c2ba25de009f7593dcf0229cd to your computer and use it in GitHub Desktop.
Script to scrape images from a flickr account.
#!/usr/bin/env python
""" Script to scrape images from a flickr account.
Author: Ralph Bean <rbean@redhat.com>
Added ability to get specific image sizes
Added thread pooling for faster retrieval of large sets of images
Added flickr api request caching for faster recovery
Author: Nathan Moinvaziri <nathan@nathanm.com>
"""
import ConfigParser
import urllib
import requests
import os
import time
import hashlib
import codecs
import json
import collections
from concurrent.futures import ThreadPoolExecutor
# Read settings from flickr.ini (current directory) and /etc/flickr.ini.
# ConfigParser.read silently skips files that do not exist, and values from
# later files in the list override earlier ones.
config = ConfigParser.ConfigParser()
config.read(['flickr.ini', '/etc/flickr.ini'])
def read_json(path):
    """Load a UTF-8 encoded JSON file from *path*, preserving key order.

    :param path: filesystem path of the JSON document to read
    :return: parsed content, with objects as OrderedDicts
    """
    with codecs.open(path, 'r', 'utf-8') as handle:
        return json.load(handle, object_pairs_hook=collections.OrderedDict)
def write_json(path, data):
    """Serialize *data* to *path* as pretty-printed, UTF-8 JSON.

    :param path: filesystem path to write (overwritten if it exists)
    :param data: any JSON-serializable object
    """
    serialized = json.dumps(data, ensure_ascii=False, indent=4)
    with codecs.open(path, 'w', 'utf-8') as handle:
        handle.write(serialized)
def flickr_request(**kwargs):
flickr_api_key = config.get('general', 'flickr_api_key')
flickr_url = config.get('general', 'flickr_url')
# Get sha1 to save response in cache
url_params = dict(
api_key=flickr_api_key,
format='json',
nojsoncallback=1,
**kwargs)
sha_1 = hashlib.sha1()
sha_1.update(flickr_url)
sha_1.update(json.dumps(url_params))
if not os.path.exists('cache'):
os.makedirs('cache')
cache_path = os.path.join('cache', sha_1.hexdigest())
if not os.path.exists(cache_path):
response = requests.get(flickr_url, params=url_params)
result = response.json()
if result['stat'] != 'ok':
print result['stat']
exit
write_json(cache_path, result)
else:
result = read_json(cache_path)
return result
def get_flickr_photos_by_person(nsid, page=1):
# https://secure.flickr.com/services/api/flickr.people.getPhotos.html
print 'Getting list of photos - {0} (Page {1})'.format(nsid, page)
return flickr_request(
method='flickr.people.getPhotos',
user_id=nsid,
content_type=1, # photos only
page=page,
per_page=500
)
def get_flickr_photo_size(photo, size):
print 'Getting photo size - {0} ({1})'.format(photo['id'], photo['title'])
# https://secure.flickr.com/services/api/flickr.photos.getSizes.html
d = flickr_request(
method='flickr.photos.getSizes',
photo_id=photo['id']
)
for s in d['sizes']['size']:
if s['label'].lower() == size:
return s
return None
def get_photos_for_person(nsid):
    """Collect the complete photo list for *nsid* across all result pages.

    :param nsid: Flickr NSID of the account
    :return: list of photo dicts
    """
    first = get_flickr_photos_by_person(nsid)['photos']
    photos = first['photo']
    # Page 1 is already included above; walk the remaining pages from the
    # last one back down to page 2, same traversal order as before.
    page = first['pages']
    while page > 1:
        result = get_flickr_photos_by_person(nsid, page=page)
        photos.extend(result['photos']['photo'])
        page -= 1
    return photos
def download_flickr_photo(photo, size):
# Get the correct size for the photo
print 'Processing photo - {0} ({1})'.format(photo['id'], photo['title'])
photo_size = get_flickr_photo_size(photo, size)
if photo_size is None:
return
# Construct url and local output path
url = photo_size['source']
url_path, url_ext = os.path.splitext(url)
output = config.get('general', 'output_dir')
local = os.path.join(output, photo['title'] + url_ext)
if not os.path.exists(local) or os.path.getsize(local) == 0:
print '* Saving url {0}\n as {1}'.format(url, local)
urllib.urlretrieve(url, local)
def main():
# https://www.webpagefx.com/tools/idgettr/
nsid = config.get('general', 'nsid')
output = config.get('general', 'output_dir')
if not os.path.exists(output):
os.makedirs(output)
output_size = config.get('general', 'output_size')
workers = config.getint('general', 'max_workers')
# First get all photos
photos = get_photos_for_person(nsid)
print 'Retrieved {0} photos for {1}'.format(len(photos), nsid)
# Launch thread pool for quickest download
with ThreadPoolExecutor(max_workers=workers) as executor:
for photo in photos:
executor.submit(download_flickr_photo, photo, output_size)
if __name__ == '__main__':
main()
; Sample flickr.ini consumed by the scraper script above.
[general]
; Flickr REST API endpoint.
flickr_url=https://api.flickr.com/services/rest/
; Your personal API key from the Flickr developer site.
flickr_api_key=put-your-api-key-here
; Directory where downloaded images are written.
output_dir=images
; Directory for cached API responses.
cache_dir=cache
; Size label to download (e.g. original, large, medium).
output_size=original
; NSID of the account to scrape.
nsid=put-nsid-for-user-account
; Number of concurrent download threads.
max_workers=8
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment