Skip to content

Instantly share code, notes, and snippets.

@koteq
Forked from Apkawa/album_fetcher.py
Created December 10, 2011 18:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save koteq/1455886 to your computer and use it in GitHub Desktop.
Fetch google+ album
# -*- coding: utf-8 -*-
"""
Usage:
python album_fetcher.py https://plus.google.com/photos/118353143366443526186/albums/5626152497309725217
python album_fetcher.py https://plus.google.com/118353143366443526186
python album_fetcher.py https://plus.google.com/118353143366443526186 youremail@gmail.com yourpassword
python album_fetcher.py https://plus.google.com/118353143366443526186 youremail@gmail.com yourpassword /out_dir/
TODO: use opt parse
"""
import os
import re
import sys
import urllib
import logging
import logging.handlers
from Queue import Queue
from threading import Thread
from collections import namedtuple
import gdata.photos.service
WINDOWS = os.name == 'nt'
if WINDOWS:
    # unidecode is only needed on Windows, where file names must be
    # transliterated and stripped of characters illegal in NTFS paths.
    from unidecode import unidecode

# A single image download: source URL plus local destination path.
DownloadTask = namedtuple('DownloadTask', ['url', 'save_path'])
# Result of parsing a Google+ URL; album_id is None for profile URLs.
ResultParseUrl = namedtuple('ResultParseUrl', ['user_id', 'album_id'])

DOWNLOAD_THREADS_COUNT = 30  # size of the image download thread pool
ALBUM_FETCHER_THREADS_COUNT = 10  # size of the album feed thread pool
DOWNLOAD_FULL_SIZED_IMAGES = True  # rewrite URLs with "s0-d" for originals
PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))
ALBUMS_SAVE_DIR = os.path.join(PROJECT_ROOT, 'fetch_albums')

log = logging.getLogger('album_fetcher')
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s',
                    datefmt='%H:%M:%S')
class Fetcher(object):
    """Download photos from a Google+ (Picasa Web) user or single album.

    Album feeds are walked and images are downloaded concurrently by two
    pools of daemon threads fed through lazily created queues.
    """

    def __init__(self, user_id, album_id, email=None, password=None, save_dir=None):
        """
        :param user_id: numeric Google+ user id (string).
        :param album_id: numeric album id, or None to fetch all albums.
        :param email: optional Google account email for authenticated access.
        :param password: optional Google account password.
        :param save_dir: destination directory; defaults to ALBUMS_SAVE_DIR.
        """
        self.user_id = user_id
        self.album_id = album_id
        self.email = email
        self.password = password
        self.save_dir = save_dir or ALBUMS_SAVE_DIR
        self._gd_client = None
        # Both queues are created lazily on the first task (see the
        # _add_*_task methods), which also starts the worker threads.
        self._download_queue = None
        self._album_fetch_queue = None

    def _strip(self, string):
        """Return *string* made safe for use as a file-system name."""
        if WINDOWS:
            # Transliterate UTF-8 to ASCII and drop characters that are
            # illegal in Windows file names.
            string = unidecode(string.decode('utf8'))
            string = re.sub(r'[\\/:"*?<>|]+', '', string)
            # BUGFIX: Python 2 str.decode() takes no keyword arguments, so
            # errors='ignore' raised TypeError; pass 'ignore' positionally.
            string = string.decode('latin-1', 'ignore').encode('latin-1')
        return string.strip()

    def _download_worker(self):
        """Daemon thread body: download queued images forever."""
        while True:
            task = self._download_queue.get()  # blocks until work arrives
            log.info('GET %s', task.url)
            try:
                input_data = urllib.urlopen(task.url)
                try:
                    output_file = open(task.save_path, "wb")
                    try:
                        output_file.write(input_data.read())
                    finally:
                        # Always release both handles, even on write errors.
                        output_file.close()
                finally:
                    input_data.close()
            except Exception:
                # Best effort: log the failure and move on to the next task.
                # BUGFIX: log.exception() requires a message argument;
                # calling it bare raised TypeError inside the handler.
                log.exception('failed to download %s', task.url)
            self._download_queue.task_done()

    def _add_download_task(self, url, save_path):
        """Queue an image download, starting the worker pool on first use."""
        if self._download_queue is None:
            self._download_queue = Queue()
            for _ in range(DOWNLOAD_THREADS_COUNT):
                t = Thread(target=self._download_worker)
                t.daemon = True  # don't block interpreter exit
                t.start()
        self._download_queue.put(DownloadTask(url, save_path))

    def _album_fetch_worker(self):
        """Daemon thread body: enumerate photos of queued albums forever."""
        while True:
            album_id = self._album_fetch_queue.get()  # blocks until work arrives
            log.info("fetching album id: %s", album_id)
            feed = self._gd_client.GetFeed(
                '/data/feed/api/user/%s/albumid/%s?kind=photo'
                % (self.user_id, album_id))
            out_dir = self._get_album_out_dir(feed)
            for filename, src_url in self._get_all_content_links(feed).iteritems():
                if WINDOWS:
                    # Keep names short for Windows path length limits.
                    # TODO: truncation may collide for names sharing a prefix.
                    filename, ext = os.path.splitext(filename)
                    filename = filename[:32] + ext
                filepath = os.path.join(out_dir, filename)
                if self.file_is_exists(filepath):
                    continue  # already downloaded (non-empty file on disk)
                if DOWNLOAD_FULL_SIZED_IMAGES:
                    # Insert the "s0-d" path segment to request original size.
                    src_url = re.sub(r'(.*)/', r'\1/s0-d/', src_url)
                self._add_download_task(src_url, filepath)
            self._album_fetch_queue.task_done()

    def _add_album_fetch_task(self, album_id):
        """Queue an album for fetching, starting the worker pool on first use."""
        if self._album_fetch_queue is None:
            self._album_fetch_queue = Queue()
            for _ in range(ALBUM_FETCHER_THREADS_COUNT):
                t = Thread(target=self._album_fetch_worker)
                t.daemon = True  # don't block interpreter exit
                t.start()
        self._album_fetch_queue.put(album_id)

    def _gd_auth(self):
        """Create (once) and return the Picasa Web service client.

        Performs a programmatic login only when credentials were supplied;
        anonymous access is used otherwise.
        """
        if self._gd_client is not None:
            return self._gd_client
        self._gd_client = gdata.photos.service.PhotosService()
        if self.email and self.password:
            self._gd_client.email = self.email
            self._gd_client.password = self.password
            self._gd_client.ProgrammaticLogin()
        return self._gd_client

    def _get_album_out_dir(self, feed):
        """Create non-existent directories and return the album save path."""
        user_name = self._strip(feed.nickname.text)
        album_name = None
        try:
            # This hack lets us group the many dummy albums from the stream
            # into one directory named after their shared albumType.
            for extension_element in feed.extension_elements:
                if extension_element.tag == 'albumType':
                    album_name = extension_element.text
        except AttributeError:
            pass
        if album_name is None:
            album_name = self._strip(feed.title.text)
        out_dir = os.path.join(self.save_dir, user_name, album_name)
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        # Leave a marker file named after the numeric user id so the
        # directory can still be matched to the user.
        # TODO: user can rename himself
        touch_file = os.path.join(self.save_dir, user_name, self.user_id)
        try:
            if not os.path.exists(touch_file):
                # BUGFIX/idiom: use open() instead of the deprecated file().
                open(touch_file, 'a').close()
            os.utime(touch_file, None)
        except IOError:
            pass  # the marker is best effort only
        return out_dir

    @staticmethod
    def parse_album_url(url):
        """Extract (user_id, album_id) from a Google+ URL.

        album_id is None for profile URLs, e.g.:
        https://plus.google.com/photos/118353143366443526186/albums/5626152497309725217
        https://plus.google.com/114051696952559973034
        """
        result = re.findall(r'photos/(\d+)/albums/(\d+)$', url)
        if not result:
            return ResultParseUrl(re.findall(r'.*/(\d+)', url)[0], None)
        return ResultParseUrl(*result[0])

    @staticmethod
    def file_is_exists(filepath):
        """Return True if *filepath* exists and is non-empty."""
        if os.path.exists(filepath):
            size = os.stat(filepath).st_size
            if size:
                return True
        return False

    def fetch(self):
        """Main class function: fetch the album(s) and wait for completion."""
        self._gd_client = self._gd_auth()
        if self.album_id:
            self._add_album_fetch_task(self.album_id)
        else:
            self._fetch_all()
        log.info("Finish fetching albums")
        if self._album_fetch_queue is not None:
            log.info("Waiting for fetch queue")
            self._album_fetch_queue.join()
        if self._download_queue is not None:
            log.info("Waiting for download queue")
            self._download_queue.join()

    def _fetch_all(self):
        """Queue every album of the user for fetching."""
        albums = self._gd_client.GetUserFeed(user=self.user_id)
        for album in albums.entry:
            # TODO: fetch posts album as single album
            # album.extension_elements[0].text = 'Buzz'
            album_id = album.gphoto_id.text
            self._add_album_fetch_task(album_id)

    def _get_all_content_links(self, feed):
        """return: {filename: content_url}"""
        return dict((self._strip(p.title.text), p.content.src) for p in feed.entry)
def main():
    """Parse command-line arguments and run the fetcher.

    Accepted forms (see module docstring):
        album_fetcher.py URL
        album_fetcher.py URL SAVE_DIR
        album_fetcher.py URL EMAIL PASSWORD
        album_fetcher.py URL EMAIL PASSWORD SAVE_DIR
    """
    args = sys.argv[1:]
    if not args:
        # BUGFIX: previously raised IndexError when run without arguments;
        # print the usage text and exit with a non-zero status instead.
        sys.exit(__doc__)
    url = args[0]
    email = None
    password = None
    save_dir = ALBUMS_SAVE_DIR
    extra = args[1:]
    if len(extra) >= 2:
        email = extra[0]
        password = extra[1]
    if len(extra) == 1:
        save_dir = extra[0]
    elif len(extra) == 3:
        save_dir = extra[2]
    parsed_url = Fetcher.parse_album_url(url)
    fetcher = Fetcher(user_id=parsed_url.user_id, album_id=parsed_url.album_id,
                      email=email, password=password, save_dir=save_dir)
    fetcher.fetch()


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment