@jaimergp
Last active April 14, 2021 04:09
Download your entire Fotolog to disk, comments included [DEPRECATED]
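Usage: python fotologbackup.py <username> [resume_url] (the optional second argument resumes scraping from a post path relative to the profile).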
#!/usr/bin/env python
# encoding: utf-8
"""
Download your entire Fotolog to disk, comments included
@jaimergp, 2016
Dependencies: requests, beautifulsoup4
"""
# Python
import os
import json
import sys
import time
# Dependencies
import requests
from bs4 import BeautifulSoup
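# Shared HTTP session with a retry-enabled adapter for http:// URLs: failed DNS
# lookups, socket connections and connection timeouts are retried up to 5 times
# (requests that already reached the server are not retried).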
session = requests.Session()
session.mount("http://", requests.adapters.HTTPAdapter(max_retries=5))
class FotologClient:
    """
    Create an API client for a given username
    """
    def __init__(self, username):
        self.username = username
        self.profile_url = 'http://www.fotolog.com/{}'.format(username)
        self.mosaic_url = '{}/mosaic'.format(self.profile_url)
        self.profile_bio = self.bio()
    def bio(self):
        """
        Parses `/mosaic` page to retrieve the profile stats and bio

        Returns
        -------
        data : dict
            Metadata of the user, including stats (number of posts, views,
            number of friends, groups, flashs), avatar, gender, marital status,
            birthday, register date, user location, profile description.
        """
        try:
            r = session.get(self.mosaic_url)
            r.raise_for_status()
        except requests.exceptions.HTTPError as e:
            sys.exit("HTTP Error. Please try again!\n{}".format(e))
        soup = BeautifulSoup(r.content, 'html.parser')
        data = {}
        # Stats
        stats_bar = soup.find('ul', attrs={'id': 'profile_bar'})
        categories = ['posts_number', 'views', 'friends_number', 'groups', 'flashs']
        for li, category in zip(stats_bar, categories):
            num, tag = li.text.split('\n')
            data[category] = num
        # Bio
        bio_col = soup.find('div', attrs={'id': 'wall_infos_profile'})
        bio_p = bio_col.find_all('p')
        avatar_url = bio_col.find('img', attrs={'alt': self.username}).get('src')
        personal, member_since = bio_p[0].text.split('\n')
        member_since = member_since.split()[-1]
        gender, marital, birthday = personal.split(' - ')
        location = bio_p[1].text.strip()
        description = '\n'.join([p.text for p in bio_p[2:]])
        data.update({'avatar_url': avatar_url,
                     'gender': gender,
                     'marital': marital,
                     'birthday': birthday,
                     'member_since': member_since,
                     'location': location,
                     'description': description
                     })
        return data
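    # Illustrative only: the dict returned by bio() carries the keys built above, e.g.
    #   {'posts_number': ..., 'views': ..., 'friends_number': ..., 'groups': ...,
    #    'flashs': ..., 'avatar_url': ..., 'gender': ..., 'marital': ...,
    #    'birthday': ..., 'member_since': ..., 'location': ..., 'description': ...}
    # (all values are strings scraped from the profile page).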
    def all_links(self, url=None):
        """
        Scrapes /mosaic to retrieve all links to published photos.

        Yields
        ------
        href : str
            The link to each post
        """
        if url is None:
            url = self.mosaic_url
        r = session.get(url)
        r.raise_for_status()
        soup = BeautifulSoup(r.content, 'html.parser')
        links = soup.find_all('a', {'class': 'wall_img_container'})
        for a in links:
            yield a.get('href')
        # Go to next page
        navigation = soup.find('div', {'id': 'pagination'}).find_all('a')
        for page in navigation:
            if page.text == '>':
                yield from self.all_links(page.get('href'))
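    # Note: all_links() is not used by the download helpers below; it is a lighter
    # alternative that only walks the /mosaic pages, e.g. list(client.all_links())
    # collects every post URL without fetching each individual post.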
    def all_posts(self, resume_url=''):
        """
        Iterates over the Fotolog profile, from newest to oldest, building a dict
        for each post: image url, comments, date, views.

        Parameters
        ----------
        resume_url : str
            Starting URL. User frontpage by default.

        Yields
        ------
        post : dict
            Dict with the metadata of each post
        """
        total = self.profile_bio['posts_number'] if not resume_url else '???'
        print('Scraping', total, 'photos for user', self.username)
        url = "{}/{}".format(self.profile_url, resume_url)
        i = 1
        while url:
            try:
                print('Getting {}/{}... [{}] '.format(i, total, url), end='\r')
                post = self.post(url)
            except (requests.exceptions.HTTPError, AttributeError):
                print('Getting {}/{}... [{}] Retrying...'.format(i, total, url), end='\r')
                time.sleep(1.0)
                continue
            except Exception:
                # PEP 479: return (instead of raising StopIteration) to end the generator
                return
            else:
                url = post['next']
                i += 1
                yield post
        print('\nDone!')
    def download(self, path=None, resume_url=''):
        """
        Download everything (images and metadata) to disk

        Parameters
        ----------
        path : str
            Base location of files. <user>/ by default.
        resume_url : str
            Starting url to scrape. User frontpage by default. It
            will iterate from newest to oldest.
        """
        if path is None:
            path = self.username
        self.mkdir(path)
        posts = []
        for post in self.all_posts(resume_url=resume_url):
            self.download_image(post['image'], basedir=path)
            posts.append(post)
        self.download_image(self.profile_bio['avatar_url'], basedir=path)
        data = {'bio': self.profile_bio, 'posts': posts}
        json_path = os.path.join(path, '{}.json'.format(self.username))
        with open(json_path, 'w+', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False)
    def download_images(self, path=None):
        """
        Download all images of the profile, with no metadata

        Parameters
        ----------
        path : str
            Base location of files. <user>/img by default.
        """
        if path is None:
            path = self.username
        # Create the img/ subdirectory the images are actually written to
        img_path = os.path.join(path, 'img')
        self.mkdir(img_path)
        for post in self.all_posts():
            self.download_image(post['image'], basedir=img_path)
    def download_metadata(self, path=None):
        """
        Dump each post metadata to JSON

        Parameters
        ----------
        path : str
            Name of dumped JSON file. <user>/<user>.json by default.
        """
        if path is None:
            # Make sure the default <user>/ directory exists before writing
            self.mkdir(self.username)
            path = '{0}/{0}.json'.format(self.username)
        bio = self.bio()
        d = {'bio': bio, 'posts': list(self.all_posts())}
        with open(path, 'w+', encoding='utf-8') as f:
            json.dump(d, f, ensure_ascii=False)
    @staticmethod
    def post(url):
        """
        Scrapes a post url to obtain image url, description, comments, date, views.

        Parameters
        ----------
        url : str
            URL of desired post

        Returns
        -------
        data : dict
            Dict with all metadata
        """
        r = session.get(url)
        r.raise_for_status()
        soup = BeautifulSoup(r.content, 'html.parser')
        image = soup.find('div', {'id': 'flog_img_holder'}).find('img').get('src')
        description_photo = soup.find('div', {'id': 'description_photo'})
        title = getattr(description_photo.find('h1'), 'text', '')
        description_lines = getattr(description_photo.find('p'), 'text', '').split('\n')
        description = '\n'.join(description_lines[:-1])
        date_and_views = description_lines[-1].split()
        date = ' '.join(date_and_views[1:-2])
        views = date_and_views[-2]
        comments = list(FotologClient.parse_comments(soup))
        next_post = soup.find('a', {'class': 'arrow_change_photo_right'})
        next_url = next_post.get('href') if next_post else None
        return {'image': image,
                'title': title,
                'description': description,
                'date': date,
                'views': views,
                'comments': comments,
                'url': url,
                'next': next_url}
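    # Illustrative only: post() returns a dict shaped like
    #   {'image': <img url>, 'title': ..., 'description': ..., 'date': ...,
    #    'views': ..., 'comments': [<dicts from parse_comments>], 'url': <post url>,
    #    'next': <url of the next post, or None for the last one>}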
    @staticmethod
    def parse_comments(soup):
        """
        Parse the comment section to obtain text, user, and date.

        Parameters
        ----------
        soup : BeautifulSoup
            Parsed HTML of post page

        Yields
        ------
        data : dict
            User, date and text of each comment
        """
        wrapper = soup.find('div', {'id': 'list_all_comments'})
        divs = wrapper.find_all('div', {'class': 'flog_img_comments'})[1:]
        for div in divs:
            lines = [l.strip() for l in div.get_text('\n').split('\n')
                     if l and '<![CDATA[' not in l]
            user = lines.pop(0) if lines else ''
            date = lines.pop(0) if lines else ''
            text = '\n'.join(lines) if lines else ''
            yield {'user': user, 'date': date, 'text': text}
    @staticmethod
    def download_image(url, basedir=''):
        """
        Download an image url to disk

        Parameters
        ----------
        url : str
            URL of desired image
        basedir : str
            Base location of downloaded image. Working directory by default.
        """
        while url:
            try:
                r = session.get(url, stream=True)
                r.raise_for_status()
            except requests.exceptions.HTTPError:
                time.sleep(1.0)
                continue
            else:
                path = os.path.join(basedir, url.split('/')[-1])
                with open(path, 'wb') as f:
                    for chunk in r:
                        f.write(chunk)
                url = None
    @staticmethod
    def mkdir(path):
        """
        Create directory if it doesn't exist
        """
        try:
            os.makedirs(path)
        except (OSError, IOError):
            if os.path.isfile(path):
                raise IOError('[!] Path {} is a file'.format(path))
if __name__ == '__main__':
    try:
        client = FotologClient(sys.argv[1])
    except IndexError:
        sys.exit('Usage: python fotologbackup.py <username>')
    else:
        resume = sys.argv[2] if sys.argv[2:3] else ''
        client.download(resume_url=resume)
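# Usage sketch ('some_user' is a placeholder username, not a real account):
#
#   $ python fotologbackup.py some_user
#   $ python fotologbackup.py some_user <resume_path>   # resume from a post path
#                                                       # relative to the profile
#
# or from an interactive Python session:
#
#   >>> client = FotologClient('some_user')
#   >>> client.download()           # images + some_user/some_user.json under some_user/
#   >>> client.download_images()    # images only, under some_user/img/
#   >>> client.download_metadata()  # metadata only, to some_user/some_user.json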
@oddgoo commented Jan 18, 2019

Fotolog has recently updated their whole website and removed comments, so unfortunately this won't work any more.

@jaimergp (Author)

@oddgoo, thanks for the heads up!

@NicolasLisperguier

Hi Jaime. Do you know how to recover my Fotolog? Even if it's just the text?

Thanks
