Skip to content

Instantly share code, notes, and snippets.

@nmoinvaz
Forked from ralphbean/flickr-scraper.py
Last active August 7, 2021 11:30
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nmoinvaz/92e1be6c2ba25de009f7593dcf0229cd to your computer and use it in GitHub Desktop.
Save nmoinvaz/92e1be6c2ba25de009f7593dcf0229cd to your computer and use it in GitHub Desktop.
Script to scrape images from a flickr account.
#!/usr/bin/env python
""" Script to scrape images from a flickr account.
Author: Ralph Bean <rbean@redhat.com>
Added ability to get specific image sizes
Added thread pooling for faster retrieval of large sets of images
Added flickr api request caching for faster recovery
Author: Nathan Moinvaziri <nathan@nathanm.com>
"""
import ConfigParser
import urllib
import requests
import os
import time
import hashlib
import codecs
import json
import collections
from concurrent.futures import ThreadPoolExecutor
# Read settings from flickr.ini (current directory) and /etc/flickr.ini.
# ConfigParser.read silently skips files that do not exist, and values from
# later files in the list override earlier ones.
config = ConfigParser.ConfigParser()
config.read(['flickr.ini', '/etc/flickr.ini'])
def read_json(path):
    """Load a UTF-8 encoded JSON file from *path*, preserving key order.

    :param path: filesystem path of the JSON document to read
    :return: parsed content, with objects as OrderedDicts
    """
    with codecs.open(path, 'r', 'utf-8') as handle:
        return json.load(handle, object_pairs_hook=collections.OrderedDict)
def write_json(path, data):
    """Serialize *data* to *path* as pretty-printed, UTF-8 JSON.

    :param path: filesystem path to write (overwritten if it exists)
    :param data: any JSON-serializable object
    """
    serialized = json.dumps(data, ensure_ascii=False, indent=4)
    with codecs.open(path, 'w', 'utf-8') as handle:
        handle.write(serialized)
def flickr_request(**kwargs):
flickr_api_key = config.get('general', 'flickr_api_key')
flickr_url = config.get('general', 'flickr_url')
# Get sha1 to save response in cache
url_params = dict(
api_key=flickr_api_key,
format='json',
nojsoncallback=1,
**kwargs)
sha_1 = hashlib.sha1()
sha_1.update(flickr_url)
sha_1.update(json.dumps(url_params))
if not os.path.exists('cache'):
os.makedirs('cache')
cache_path = os.path.join('cache', sha_1.hexdigest())
if not os.path.exists(cache_path):
response = requests.get(flickr_url, params=url_params)
result = response.json()
if result['stat'] != 'ok':
print result['stat']
exit
write_json(cache_path, result)
else:
result = read_json(cache_path)
return result
def get_flickr_photos_by_person(nsid, page=1):
# https://secure.flickr.com/services/api/flickr.people.getPhotos.html
print 'Getting list of photos - {0} (Page {1})'.format(nsid, page)
return flickr_request(
method='flickr.people.getPhotos',
user_id=nsid,
content_type=1, # photos only
page=page,
per_page=500
)
def get_flickr_photo_size(photo, size):
print 'Getting photo size - {0} ({1})'.format(photo['id'], photo['title'])
# https://secure.flickr.com/services/api/flickr.photos.getSizes.html
d = flickr_request(
method='flickr.photos.getSizes',
photo_id=photo['id']
)
for s in d['sizes']['size']:
if s['label'].lower() == size:
return s
return None
def get_photos_for_person(nsid):
    """Collect the complete photo list for *nsid* across all result pages.

    :param nsid: Flickr NSID of the account
    :return: list of photo dicts
    """
    first = get_flickr_photos_by_person(nsid)['photos']
    photos = first['photo']
    # Page 1 is already included above; walk the remaining pages from the
    # last one back down to page 2, same traversal order as before.
    page = first['pages']
    while page > 1:
        result = get_flickr_photos_by_person(nsid, page=page)
        photos.extend(result['photos']['photo'])
        page -= 1
    return photos
def download_flickr_photo(photo, size):
# Get the correct size for the photo
print 'Processing photo - {0} ({1})'.format(photo['id'], photo['title'])
photo_size = get_flickr_photo_size(photo, size)
if photo_size is None:
return
# Construct url and local output path
url = photo_size['source']
url_path, url_ext = os.path.splitext(url)
output = config.get('general', 'output_dir')
local = os.path.join(output, photo['title'] + url_ext)
if not os.path.exists(local) or os.path.getsize(local) == 0:
print '* Saving url {0}\n as {1}'.format(url, local)
urllib.urlretrieve(url, local)
def main():
# https://www.webpagefx.com/tools/idgettr/
nsid = config.get('general', 'nsid')
output = config.get('general', 'output_dir')
if not os.path.exists(output):
os.makedirs(output)
output_size = config.get('general', 'output_size')
workers = config.getint('general', 'max_workers')
# First get all photos
photos = get_photos_for_person(nsid)
print 'Retrieved {0} photos for {1}'.format(len(photos), nsid)
# Launch thread pool for quickest download
with ThreadPoolExecutor(max_workers=workers) as executor:
for photo in photos:
executor.submit(download_flickr_photo, photo, output_size)
if __name__ == '__main__':
main()
; Sample flickr.ini consumed by the scraper script above.
[general]
; Flickr REST API endpoint.
flickr_url=https://api.flickr.com/services/rest/
; Your personal API key from the Flickr developer site.
flickr_api_key=put-your-api-key-here
; Directory where downloaded images are written.
output_dir=images
; Directory for cached API responses.
cache_dir=cache
; Size label to download (e.g. original, large, medium).
output_size=original
; NSID of the account to scrape.
nsid=put-nsid-for-user-account
; Number of concurrent download threads.
max_workers=8
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment