@jaimergp
Last active April 14, 2021 04:09
Download your entire Fotolog to disk, comments included [DEPRECATED]
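Usage: python fotologbackup.py <username> [resume_url] (the optional second argument resumes scraping from a post path relative to the profile).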
#!/usr/bin/env python
# encoding: utf-8
"""
Download your entire Fotolog to disk, comments included
@jaimergp, 2016
Dependencies: requests, beautifulsoup4
"""
# Python
import os
import json
import sys
import time
# Dependencies
import requests
from bs4 import BeautifulSoup
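# Shared HTTP session with a retry-enabled adapter for http:// URLs: failed DNS
# lookups, socket connections and connection timeouts are retried up to 5 times
# (requests that already reached the server are not retried).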
session = requests.Session()
session.mount("http://", requests.adapters.HTTPAdapter(max_retries=5))
class FotologClient:
    """
    Create an API client for a given username
    """
    def __init__(self, username):
        self.username = username
        self.profile_url = 'http://www.fotolog.com/{}'.format(username)
        self.mosaic_url = '{}/mosaic'.format(self.profile_url)
        self.profile_bio = self.bio()
    def bio(self):
        """
        Parses `/mosaic` page to retrieve the profile stats and bio

        Returns
        -------
        data : dict
            Metadata of the user, including stats (number of posts, views,
            number of friends, groups, flashs), avatar, gender, marital status,
            birthday, register date, user location, profile description.
        """
        try:
            r = session.get(self.mosaic_url)
            r.raise_for_status()
        except requests.exceptions.HTTPError as e:
            sys.exit("HTTP Error. Please try again!\n{}".format(e))
        soup = BeautifulSoup(r.content, 'html.parser')
        data = {}
        # Stats
        stats_bar = soup.find('ul', attrs={'id': 'profile_bar'})
        categories = ['posts_number', 'views', 'friends_number', 'groups', 'flashs']
        for li, category in zip(stats_bar, categories):
            num, tag = li.text.split('\n')
            data[category] = num
        # Bio
        bio_col = soup.find('div', attrs={'id': 'wall_infos_profile'})
        bio_p = bio_col.find_all('p')
        avatar_url = bio_col.find('img', attrs={'alt': self.username}).get('src')
        personal, member_since = bio_p[0].text.split('\n')
        member_since = member_since.split()[-1]
        gender, marital, birthday = personal.split(' - ')
        location = bio_p[1].text.strip()
        description = '\n'.join([p.text for p in bio_p[2:]])
        data.update({'avatar_url': avatar_url,
                     'gender': gender,
                     'marital': marital,
                     'birthday': birthday,
                     'member_since': member_since,
                     'location': location,
                     'description': description
                     })
        return data
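    # Illustrative only: the dict returned by bio() carries the keys built above, e.g.
    #   {'posts_number': ..., 'views': ..., 'friends_number': ..., 'groups': ...,
    #    'flashs': ..., 'avatar_url': ..., 'gender': ..., 'marital': ...,
    #    'birthday': ..., 'member_since': ..., 'location': ..., 'description': ...}
    # (all values are strings scraped from the profile page).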
    def all_links(self, url=None):
        """
        Scrapes /mosaic to retrieve all links to published photos.

        Yields
        ------
        href : str
            The link to each post
        """
        if url is None:
            url = self.mosaic_url
        r = session.get(url)
        r.raise_for_status()
        soup = BeautifulSoup(r.content, 'html.parser')
        links = soup.find_all('a', {'class': 'wall_img_container'})
        for a in links:
            yield a.get('href')
        # Go to next page
        navigation = soup.find('div', {'id': 'pagination'}).find_all('a')
        for page in navigation:
            if page.text == '>':
                yield from self.all_links(page.get('href'))
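    # Note: all_links() is not used by the download helpers below; it is a lighter
    # alternative that only walks the /mosaic pages, e.g. list(client.all_links())
    # collects every post URL without fetching each individual post.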
    def all_posts(self, resume_url=''):
        """
        Iterates over the Fotolog profile, from newest to oldest, building a dict
        for each post: image url, comments, date, views.

        Parameters
        ----------
        resume_url : str
            Starting URL. User frontpage by default.

        Yields
        ------
        post : dict
            Dict with the metadata of each post
        """
        total = self.profile_bio['posts_number'] if not resume_url else '???'
        print('Scraping', total, 'photos for user', self.username)
        url = "{}/{}".format(self.profile_url, resume_url)
        i = 1
        while url:
            try:
                print('Getting {}/{}... [{}] '.format(i, total, url), end='\r')
                post = self.post(url)
            except (requests.exceptions.HTTPError, AttributeError):
                print('Getting {}/{}... [{}] Retrying...'.format(i, total, url), end='\r')
                time.sleep(1.0)
                continue
            except Exception:
                # PEP 479: return (instead of raising StopIteration) to end the generator
                return
            else:
                url = post['next']
                i += 1
                yield post
        print('\nDone!')
    def download(self, path=None, resume_url=''):
        """
        Download everything (images and metadata) to disk

        Parameters
        ----------
        path : str
            Base location of files. <user>/ by default.
        resume_url : str
            Starting url to scrape. User frontpage by default. It
            will iterate from newest to oldest.
        """
        if path is None:
            path = self.username
        self.mkdir(path)
        posts = []
        for post in self.all_posts(resume_url=resume_url):
            self.download_image(post['image'], basedir=path)
            posts.append(post)
        self.download_image(self.profile_bio['avatar_url'], basedir=path)
        data = {'bio': self.profile_bio, 'posts': posts}
        json_path = os.path.join(path, '{}.json'.format(self.username))
        with open(json_path, 'w+', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False)
    def download_images(self, path=None):
        """
        Download all images of the profile, with no metadata

        Parameters
        ----------
        path : str
            Base location of files. <user>/img by default.
        """
        if path is None:
            path = self.username
        # Create the img/ subdirectory the images are actually written to
        img_path = os.path.join(path, 'img')
        self.mkdir(img_path)
        for post in self.all_posts():
            self.download_image(post['image'], basedir=img_path)
    def download_metadata(self, path=None):
        """
        Dump each post metadata to JSON

        Parameters
        ----------
        path : str
            Name of dumped JSON file. <user>/<user>.json by default.
        """
        if path is None:
            # Make sure the default <user>/ directory exists before writing
            self.mkdir(self.username)
            path = '{0}/{0}.json'.format(self.username)
        bio = self.bio()
        d = {'bio': bio, 'posts': list(self.all_posts())}
        with open(path, 'w+', encoding='utf-8') as f:
            json.dump(d, f, ensure_ascii=False)
    @staticmethod
    def post(url):
        """
        Scrapes a post url to obtain image url, description, comments, date, views.

        Parameters
        ----------
        url : str
            URL of desired post

        Returns
        -------
        data : dict
            Dict with all metadata
        """
        r = session.get(url)
        r.raise_for_status()
        soup = BeautifulSoup(r.content, 'html.parser')
        image = soup.find('div', {'id': 'flog_img_holder'}).find('img').get('src')
        description_photo = soup.find('div', {'id': 'description_photo'})
        title = getattr(description_photo.find('h1'), 'text', '')
        description_lines = getattr(description_photo.find('p'), 'text', '').split('\n')
        description = '\n'.join(description_lines[:-1])
        date_and_views = description_lines[-1].split()
        date = ' '.join(date_and_views[1:-2])
        views = date_and_views[-2]
        comments = list(FotologClient.parse_comments(soup))
        next_post = soup.find('a', {'class': 'arrow_change_photo_right'})
        next_url = next_post.get('href') if next_post else None
        return {'image': image,
                'title': title,
                'description': description,
                'date': date,
                'views': views,
                'comments': comments,
                'url': url,
                'next': next_url}
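    # Illustrative only: post() returns a dict shaped like
    #   {'image': <img url>, 'title': ..., 'description': ..., 'date': ...,
    #    'views': ..., 'comments': [<dicts from parse_comments>], 'url': <post url>,
    #    'next': <url of the next post, or None for the last one>}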
    @staticmethod
    def parse_comments(soup):
        """
        Parse the comment section to obtain text, user, and date.

        Parameters
        ----------
        soup : BeautifulSoup
            Parsed HTML of post page

        Yields
        ------
        data : dict
            User, date and text of each comment
        """
        wrapper = soup.find('div', {'id': 'list_all_comments'})
        divs = wrapper.find_all('div', {'class': 'flog_img_comments'})[1:]
        for div in divs:
            lines = [l.strip() for l in div.get_text('\n').split('\n')
                     if l and '<![CDATA[' not in l]
            user = lines.pop(0) if lines else ''
            date = lines.pop(0) if lines else ''
            text = '\n'.join(lines) if lines else ''
            yield {'user': user, 'date': date, 'text': text}
    @staticmethod
    def download_image(url, basedir=''):
        """
        Download an image url to disk

        Parameters
        ----------
        url : str
            URL of desired image
        basedir : str
            Base location of downloaded image. Working directory by default.
        """
        while url:
            try:
                r = session.get(url, stream=True)
                r.raise_for_status()
            except requests.exceptions.HTTPError:
                time.sleep(1.0)
                continue
            else:
                path = os.path.join(basedir, url.split('/')[-1])
                with open(path, 'wb') as f:
                    for chunk in r:
                        f.write(chunk)
                url = None
    @staticmethod
    def mkdir(path):
        """
        Create directory if it doesn't exist
        """
        try:
            os.makedirs(path)
        except (OSError, IOError):
            if os.path.isfile(path):
                raise IOError('[!] Path {} is a file'.format(path))
if __name__ == '__main__':
    try:
        client = FotologClient(sys.argv[1])
    except IndexError:
        sys.exit('Usage: python fotologbackup.py <username>')
    else:
        resume = sys.argv[2] if sys.argv[2:3] else ''
        client.download(resume_url=resume)
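# Usage sketch ('some_user' is a placeholder username, not a real account):
#
#   $ python fotologbackup.py some_user
#   $ python fotologbackup.py some_user <resume_path>   # resume from a post path
#                                                       # relative to the profile
#
# or from an interactive Python session:
#
#   >>> client = FotologClient('some_user')
#   >>> client.download()           # images + some_user/some_user.json under some_user/
#   >>> client.download_images()    # images only, under some_user/img/
#   >>> client.download_metadata()  # metadata only, to some_user/some_user.json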
@oddgoo commented Jan 18, 2019

Fotolog has recently updated their whole website and removed comments, so unfortunately this won't work any more.

@jaimergp (Author)

@oddgoo, thanks for the heads up!

@NicolasLisperguier

Hi Jaime. Do you know how to recover my Fotolog? Even if it's just the text?

Thanks
