Skip to content

Instantly share code, notes, and snippets.

@rbhatia46
Forked from zmwangx/fb-dl.py
Created July 11, 2017 14:40
Show Gist options
  • Save rbhatia46/3035ad79ed349602612719151746f31f to your computer and use it in GitHub Desktop.
Save rbhatia46/3035ad79ed349602612719151746f31f to your computer and use it in GitHub Desktop.
Scrape all photos from a public Facebook page.
#!/usr/bin/env python
############################### README ###############################
# External dependencies:
# * libmagic
# * python-dateutil
# * python-magic
# * requests
#
# The shell command call at the end was tested with GNU date from GNU
# coreutils. Other implementations might not work (for instance, BSD
# date is not compatible). However, you may comment out the last
# section without affecting the download.
#
# The "last_update_timestamp" is reserved for the update feature,
# which is not implemented yet.
######################################################################
import commands
import json
import os
import sys
import urllib
import dateutil.parser
import magic
import requests
########################### CUSTOMIZE THIS ###########################
page_id = "567045406742962"
access_token = "app_id|app_secret" # see https://developers.facebook.com/docs/facebook-login/access-tokens#apptokens
dest = os.path.expanduser("~/img/sns/apink-official-facebook")
website_title = "apink-official-facebook"
######################################################################
if not os.path.exists(dest):
os.makedirs(dest)
# read last update time, if it is available
last_update_record = dest + "/last_update_timestamp"
if os.path.exists(last_update_record):
f = open(last_update_record, "r")
last_update_timestamp = f.readline()
f.close()
last_update_time = dateutil.parser.parse(last_update_timestamp)
else:
last_update_time = dateutil.parser.parse("1970-01-01T00:00+00:00")
# this function makes an API call with only an access_token (which
# could be just app-id|app-secret)
def fb_public_call(endpoint, params, access_token):
params["access_token"] = access_token
response = requests.get("https://graph.facebook.com/" + endpoint,
params=params)
return response.json()
# this function downloads a photo
# return codes are defined below
SUCCESS = 0
FAILED_DOWNLOAD = 1
UNRECOGNIZED_MIME = 2
OLD_PHOTO = 255 # photo older than last update time
def handle_photo(photo, album_id):
# print information
photo_id = photo["id"]
time = dateutil.parser.parse(photo["created_time"])
if time < last_update_time:
return OLD_PHOTO
time_print = time.strftime("%b %d, %Y")
time_full = time.strftime("%Y%m%d%H%M%S")
original_image = photo["images"][0]
height = original_image["height"]
width = original_image["width"]
format_string = "date: %s id: %s size: %sx%s"
print format_string % (time_print, photo_id, width,
height)
# download file
source_uri = original_image["source"]
filename = time_full + "-" + website_title + "-" + \
album_id + "-" + photo_id
filepath = dest + "/" + filename
urllib.urlretrieve(source_uri, filepath)
# identify mime type and attach extension
if os.path.exists(filepath):
mime = magic.from_file(filepath, mime=True)
if mime == "image/gif":
newfilepath = filepath + ".gif"
elif mime == "image/jpeg":
newfilepath = filepath + ".jpg"
elif mime == "image/png":
newfilepath = filepath + ".png"
else:
err = filepath + ": error: " + \
"unrecgonized image type\n"
sys.stderr.write(err)
return UNRECOGNIZED_MIME
os.rename(filepath, newfilepath)
return SUCCESS
else:
# donwload failed for whatever reason
err = "error: " + filename + " failed to " + \
"downloaded from " + source_uri + "\n"
sys.stderr.write(err)
return FAILED_DOWNLOAD
# this function handles an album, i.e., download newly added photos
# since the last update
def handle_album(album):
# print album info
album_id = album["id"]
format_string = "downloading album \"%s\" " + \
"(album id: %s; photo count: %s)"
print format_string % (album["name"], album_id,
album["count"])
print "-" * 80
# retrieve photos in the album
photos_response = fb_public_call(album["id"] + "/photos",
params, access_token)
while True:
for photo in photos_response["data"]:
if handle_photo(photo, album_id) == OLD_PHOTO:
# already encountered old photo in this album
# no need to look further into the past
print
return
if "next" in photos_response["paging"]:
next_uri = photos_response["paging"]["next"]
photos_response = requests.get(next_uri).json()
else:
break
print
params = {}
# retrieve albums
albums_response = fb_public_call(page_id + "/albums", params,
access_token);
while True:
for album in albums_response["data"]:
handle_album(album)
if "next" in albums_response["paging"]:
next_uri = albums_response["paging"]["next"]
albums_response = requests.get(next_uri).json()
else:
break
# update feature yet to be implemented
# create a file "last_update_timestamp" for future use
f = open(last_update_record, "w")
f.write(commands.getoutput("date -u --iso-8601=seconds"))
f.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment