Skip to content

Instantly share code, notes, and snippets.

@koteq
Forked from Apkawa/album_fetcher.py
Created December 10, 2011 18:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save koteq/1455886 to your computer and use it in GitHub Desktop.
Fetch google+ album
# -*- coding: utf-8 -*-
"""
Usage:
python album_fetcher.py https://plus.google.com/photos/118353143366443526186/albums/5626152497309725217
python album_fetcher.py https://plus.google.com/118353143366443526186
python album_fetcher.py https://plus.google.com/118353143366443526186 youremail@gmail.com yourpassword
python album_fetcher.py https://plus.google.com/118353143366443526186 youremail@gmail.com yourpassword /out_dir/
TODO: use opt parse
"""
import os
import re
import sys
import urllib
import logging
import logging.handlers
from Queue import Queue
from threading import Thread
from collections import namedtuple
import gdata.photos.service
WINDOWS = os.name == 'nt'
if WINDOWS:
    # unidecode is only needed on Windows, where file names must be
    # transliterated and stripped of characters illegal in NTFS paths.
    from unidecode import unidecode

# A single image download: source URL plus local destination path.
DownloadTask = namedtuple('DownloadTask', ['url', 'save_path'])
# Result of parsing a Google+ URL; album_id is None for profile URLs.
ResultParseUrl = namedtuple('ResultParseUrl', ['user_id', 'album_id'])

DOWNLOAD_THREADS_COUNT = 30  # size of the image download thread pool
ALBUM_FETCHER_THREADS_COUNT = 10  # size of the album feed thread pool
DOWNLOAD_FULL_SIZED_IMAGES = True  # rewrite URLs with "s0-d" for originals
PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))
ALBUMS_SAVE_DIR = os.path.join(PROJECT_ROOT, 'fetch_albums')

log = logging.getLogger('album_fetcher')
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s',
                    datefmt='%H:%M:%S')
class Fetcher(object):
    """Download photos from a Google+ (Picasa Web) user or single album.

    Album feeds are walked and images are downloaded concurrently by two
    pools of daemon threads fed through lazily created queues.
    """

    def __init__(self, user_id, album_id, email=None, password=None, save_dir=None):
        """
        :param user_id: numeric Google+ user id (string).
        :param album_id: numeric album id, or None to fetch all albums.
        :param email: optional Google account email for authenticated access.
        :param password: optional Google account password.
        :param save_dir: destination directory; defaults to ALBUMS_SAVE_DIR.
        """
        self.user_id = user_id
        self.album_id = album_id
        self.email = email
        self.password = password
        self.save_dir = save_dir or ALBUMS_SAVE_DIR
        self._gd_client = None
        # Both queues are created lazily on the first task (see the
        # _add_*_task methods), which also starts the worker threads.
        self._download_queue = None
        self._album_fetch_queue = None

    def _strip(self, string):
        """Return *string* made safe for use as a file-system name."""
        if WINDOWS:
            # Transliterate UTF-8 to ASCII and drop characters that are
            # illegal in Windows file names.
            string = unidecode(string.decode('utf8'))
            string = re.sub(r'[\\/:"*?<>|]+', '', string)
            # BUGFIX: Python 2 str.decode() takes no keyword arguments, so
            # errors='ignore' raised TypeError; pass 'ignore' positionally.
            string = string.decode('latin-1', 'ignore').encode('latin-1')
        return string.strip()

    def _download_worker(self):
        """Daemon thread body: download queued images forever."""
        while True:
            task = self._download_queue.get()  # blocks until work arrives
            log.info('GET %s', task.url)
            try:
                input_data = urllib.urlopen(task.url)
                try:
                    output_file = open(task.save_path, "wb")
                    try:
                        output_file.write(input_data.read())
                    finally:
                        # Always release both handles, even on write errors.
                        output_file.close()
                finally:
                    input_data.close()
            except Exception:
                # Best effort: log the failure and move on to the next task.
                # BUGFIX: log.exception() requires a message argument;
                # calling it bare raised TypeError inside the handler.
                log.exception('failed to download %s', task.url)
            self._download_queue.task_done()

    def _add_download_task(self, url, save_path):
        """Queue an image download, starting the worker pool on first use."""
        if self._download_queue is None:
            self._download_queue = Queue()
            for _ in range(DOWNLOAD_THREADS_COUNT):
                t = Thread(target=self._download_worker)
                t.daemon = True  # don't block interpreter exit
                t.start()
        self._download_queue.put(DownloadTask(url, save_path))

    def _album_fetch_worker(self):
        """Daemon thread body: enumerate photos of queued albums forever."""
        while True:
            album_id = self._album_fetch_queue.get()  # blocks until work arrives
            log.info("fetching album id: %s", album_id)
            feed = self._gd_client.GetFeed(
                '/data/feed/api/user/%s/albumid/%s?kind=photo'
                % (self.user_id, album_id))
            out_dir = self._get_album_out_dir(feed)
            for filename, src_url in self._get_all_content_links(feed).iteritems():
                if WINDOWS:
                    # Keep names short for Windows path length limits.
                    # TODO: truncation may collide for names sharing a prefix.
                    filename, ext = os.path.splitext(filename)
                    filename = filename[:32] + ext
                filepath = os.path.join(out_dir, filename)
                if self.file_is_exists(filepath):
                    continue  # already downloaded (non-empty file on disk)
                if DOWNLOAD_FULL_SIZED_IMAGES:
                    # Insert the "s0-d" path segment to request original size.
                    src_url = re.sub(r'(.*)/', r'\1/s0-d/', src_url)
                self._add_download_task(src_url, filepath)
            self._album_fetch_queue.task_done()

    def _add_album_fetch_task(self, album_id):
        """Queue an album for fetching, starting the worker pool on first use."""
        if self._album_fetch_queue is None:
            self._album_fetch_queue = Queue()
            for _ in range(ALBUM_FETCHER_THREADS_COUNT):
                t = Thread(target=self._album_fetch_worker)
                t.daemon = True  # don't block interpreter exit
                t.start()
        self._album_fetch_queue.put(album_id)

    def _gd_auth(self):
        """Create (once) and return the Picasa Web service client.

        Performs a programmatic login only when credentials were supplied;
        anonymous access is used otherwise.
        """
        if self._gd_client is not None:
            return self._gd_client
        self._gd_client = gdata.photos.service.PhotosService()
        if self.email and self.password:
            self._gd_client.email = self.email
            self._gd_client.password = self.password
            self._gd_client.ProgrammaticLogin()
        return self._gd_client

    def _get_album_out_dir(self, feed):
        """Create non-existent directories and return the album save path."""
        user_name = self._strip(feed.nickname.text)
        album_name = None
        try:
            # This hack lets us group the many dummy albums from the stream
            # into one directory named after their shared albumType.
            for extension_element in feed.extension_elements:
                if extension_element.tag == 'albumType':
                    album_name = extension_element.text
        except AttributeError:
            pass
        if album_name is None:
            album_name = self._strip(feed.title.text)
        out_dir = os.path.join(self.save_dir, user_name, album_name)
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        # Leave a marker file named after the numeric user id so the
        # directory can still be matched to the user.
        # TODO: user can rename himself
        touch_file = os.path.join(self.save_dir, user_name, self.user_id)
        try:
            if not os.path.exists(touch_file):
                # BUGFIX/idiom: use open() instead of the deprecated file().
                open(touch_file, 'a').close()
            os.utime(touch_file, None)
        except IOError:
            pass  # the marker is best effort only
        return out_dir

    @staticmethod
    def parse_album_url(url):
        """Extract (user_id, album_id) from a Google+ URL.

        album_id is None for profile URLs, e.g.:
        https://plus.google.com/photos/118353143366443526186/albums/5626152497309725217
        https://plus.google.com/114051696952559973034
        """
        result = re.findall(r'photos/(\d+)/albums/(\d+)$', url)
        if not result:
            return ResultParseUrl(re.findall(r'.*/(\d+)', url)[0], None)
        return ResultParseUrl(*result[0])

    @staticmethod
    def file_is_exists(filepath):
        """Return True if *filepath* exists and is non-empty."""
        if os.path.exists(filepath):
            size = os.stat(filepath).st_size
            if size:
                return True
        return False

    def fetch(self):
        """Main class function: fetch the album(s) and wait for completion."""
        self._gd_client = self._gd_auth()
        if self.album_id:
            self._add_album_fetch_task(self.album_id)
        else:
            self._fetch_all()
        log.info("Finish fetching albums")
        if self._album_fetch_queue is not None:
            log.info("Waiting for fetch queue")
            self._album_fetch_queue.join()
        if self._download_queue is not None:
            log.info("Waiting for download queue")
            self._download_queue.join()

    def _fetch_all(self):
        """Queue every album of the user for fetching."""
        albums = self._gd_client.GetUserFeed(user=self.user_id)
        for album in albums.entry:
            # TODO: fetch posts album as single album
            # album.extension_elements[0].text = 'Buzz'
            album_id = album.gphoto_id.text
            self._add_album_fetch_task(album_id)

    def _get_all_content_links(self, feed):
        """return: {filename: content_url}"""
        return dict((self._strip(p.title.text), p.content.src) for p in feed.entry)
def main():
    """Parse command-line arguments and run the fetcher.

    Accepted forms (see module docstring):
        album_fetcher.py URL
        album_fetcher.py URL SAVE_DIR
        album_fetcher.py URL EMAIL PASSWORD
        album_fetcher.py URL EMAIL PASSWORD SAVE_DIR
    """
    args = sys.argv[1:]
    if not args:
        # BUGFIX: previously raised IndexError when run without arguments;
        # print the usage text and exit with a non-zero status instead.
        sys.exit(__doc__)
    url = args[0]
    email = None
    password = None
    save_dir = ALBUMS_SAVE_DIR
    extra = args[1:]
    if len(extra) >= 2:
        email = extra[0]
        password = extra[1]
    if len(extra) == 1:
        save_dir = extra[0]
    elif len(extra) == 3:
        save_dir = extra[2]
    parsed_url = Fetcher.parse_album_url(url)
    fetcher = Fetcher(user_id=parsed_url.user_id, album_id=parsed_url.album_id,
                      email=email, password=password, save_dir=save_dir)
    fetcher.fetch()


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment