david-crespo/README.md

## README.md

      
    Raw
  

              README.md
            
          
    Download photos and videos you're tagged in on Facebook

Why

When you download an archive of your Facebook account, Facebook includes photos and videos you've uploaded, but not photos and videos you're tagged in that were uploaded by other people. This is a script to automatically download those.
Setup

This requires Python 3.

Make sure you have curl (Linux and Mac likely already have it)
Run `mkdir photos videos` in the same directory as the script
pip3 install selenium
Download the ChromeDriver executable and put it somewhere in your PATH
Set FB_USER_ID and CHROME_PROFILE_PATH in helpers.py
Set CONTAINER_SELECTOR in tagged_photos.py (see below)

Authentication

The trick here is to avoid having to log in from the script by using the same Chrome profile every time the script runs. Run `python3 tagged_photos.py`; a Chrome window will open and you will be redirected to the Facebook login page. Once you log in, your login will persist in that profile, so you can close the window and run the script again, and it should work.
CONTAINER_SELECTOR

The photo downloader relies on a particular class that is likely to change over time because it's auto-generated by FB's frontend build process. It was .atb when I wrote this but it'll probably change all the time. You'll have to dig into the source of the photo page to figure out what the right class is.
Running it

Run `python3 tagged_photos.py` or `python3 tagged_videos.py`.
The photos can take a while if you have a lot because it is navigating through the site in real time and I didn't figure out how to parallelize it because this is Python (would have been easy in JS). For about 900 photos it took almost an hour.

  
## helpers.py
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Identifier placed into the profile URLs the scripts open, e.g.
# https://m.facebook.com/<FB_USER_ID>/photos. Empty by default; must be filled in.
FB_USER_ID = '' # SET ME

# Directory handed to Chrome as --user-data-dir by get_driver().
# on mac, probably /Users/<mac username>/Library/Application Support/Google/Chrome/Default
CHROME_PROFILE_PATH = ""


def get_driver():
    """Create a Chrome WebDriver that uses the configured Chrome profile.

    Chrome is started with notifications and infobars disabled, audio muted
    and the window maximized. ``CHROME_PROFILE_PATH`` is passed to Chrome as
    ``--user-data-dir`` so the same profile is used on every run.

    Returns:
        The ``webdriver.Chrome`` instance. The caller owns it and is
        responsible for closing it.
    """
    wd_options = Options()
    for flag in (
        "--disable-notifications",
        "--disable-infobars",
        "--mute-audio",
        "--start-maximized",
    ):
        wd_options.add_argument(flag)
    wd_options.add_argument("--user-data-dir={}".format(CHROME_PROFILE_PATH))

    # ``chrome_options=`` is the deprecated spelling of this keyword and is
    # rejected by newer Selenium releases; ``options=`` is the supported one.
    return webdriver.Chrome(options=wd_options)


def scroll_to_bottom(driver, pause=1):
    """Scroll the current page down until its height stops growing.

    The page is scrolled to ``document.body.scrollHeight`` repeatedly. After
    each scroll the function waits ``pause`` seconds and re-reads the height;
    it returns as soon as two consecutive readings are equal.

    Args:
        driver: Object exposing ``execute_script(js)`` (the scripts pass the
            driver returned by ``get_driver``).
        pause: Seconds to wait after each scroll before measuring the page
            again. Defaults to 1, the value that used to be hard-coded.

    Returns:
        None.
    """
    last_height = driver.execute_script("return document.body.scrollHeight")

    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Give the page time to append more content before measuring again.
        time.sleep(pause)

        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            return
        last_height = new_height

## tagged_photos.py
import json, re

from datetime import datetime, timezone
from subprocess import call

from helpers import scroll_to_bottom, get_driver, FB_USER_ID

# you will likely need to update this to something that selects
# for the container around the photo info, timestamp, album, etc
# CSS selector; the script below looks up "<selector> abbr" for the timestamp
# and "<selector> > div" for the info text on each photo page.
CONTAINER_SELECTOR = ".atb"


def get_fb_id(link):
    """Return an identifier for a photo link, used to build its filename.

    Args:
        link: URL of the photo page, as a string.

    Returns:
        The digits of the ``fbid=`` parameter when the link contains one;
        otherwise ``"fake_id_"`` followed by 16 hex characters derived from
        the link itself.
    """
    import hashlib  # local import: only the fallback path needs it

    match = re.search(r"fbid=([0-9]+)", link)
    if match:
        return match.group(1)

    # The built-in hash() of a str is salted per interpreter process, so it
    # gave the same link a different fallback id (and filename) on every run.
    # A content digest is stable across runs.
    digest = hashlib.sha256(link.encode("utf-8")).hexdigest()
    return "fake_id_" + digest[:16]

# Script entry point: visit every photo on the profile's photos page, record
# its metadata in photos/data.json and download the full-size image with curl.
if __name__ == '__main__':
    print("-"*20 + "\nOpening Browser...")

    driver = get_driver()
    # Always shut the browser down, even when a lookup below raises;
    # previously the driver was never closed.
    try:
        driver.get("https://m.facebook.com/{}/photos".format(FB_USER_ID))
        scroll_to_bottom(driver)

        # Hrefs are read up front, before the loop below navigates the driver
        # to other pages.
        photo_links = [
            el.get_attribute("href")
            for el in driver.find_elements_by_css_selector('.timeline.photos a')
        ]

        # Shared json.dumps formatting for console output and data.json.
        pretty = dict(sort_keys=True, indent=4, separators=(',', ': '))

        photos = []
        for link in photo_links:
            driver.get(link)

            photo_id = get_fb_id(link)
            full_size_url = driver.find_element_by_link_text("View Full Size").get_attribute("href")
            actor = driver.find_element_by_css_selector('.actor').text
            people = [
                el.text
                for el in driver.find_elements_by_css_selector('.tagName')
            ]
            caption = driver.find_element_by_css_selector('.msg > div').text
            # The timestamp is read from the JSON held in the <abbr>'s
            # data-store attribute, under the "time" key.
            timestamp_json = driver.find_element_by_css_selector('{} abbr'.format(CONTAINER_SELECTOR)).get_attribute('data-store')
            timestamp = json.loads(timestamp_json).get("time")
            # Middle dots in the info text become "-", then trailing " -" is trimmed.
            info = driver.find_element_by_css_selector('{} > div'.format(CONTAINER_SELECTOR)).text.replace('\u00b7', '-').rstrip(' -')
            date = datetime.fromtimestamp(timestamp, timezone.utc).strftime("%Y-%m-%d")
            filename = "{}_{}.jpg".format(date, photo_id)

            # After loading the "View Full Size" link, driver.current_url is
            # what gets stored as cdn_url and handed to curl.
            driver.get(full_size_url)
            photo = {
                "fb_url": link,
                "cdn_url": driver.current_url,
                "actor": actor,
                "caption": caption,
                "timestamp": timestamp,
                "info": info,
                "filename": filename,
                "people": people
            }
            print(json.dumps(photo, **pretty))
            photos.append(photo)

            # data.json is rewritten in full after every photo, so it always
            # holds everything collected so far.
            with open('photos/data.json', 'w') as f:
                f.write(json.dumps(photos, **pretty))

            call(["curl", driver.current_url, "--output", "photos/{}".format(filename)])
    finally:
        driver.quit()

## tagged_videos.py
import re

from subprocess import call

from helpers import scroll_to_bottom, get_driver, FB_USER_ID

# Script entry point: visit every video on the profile's videos page, save the
# page HTML and download the video file with curl.
if __name__ == '__main__':
    print("-"*20 + "\nOpening Browser...")

    driver = get_driver()
    # Always shut the browser down, even when a lookup below raises;
    # previously the driver was never closed.
    try:
        driver.get("https://www.facebook.com/{}/videos".format(FB_USER_ID))
        scroll_to_bottom(driver)

        # Each link is rewritten from the www. host to the m. host. Only the
        # first "www." is replaced so a later occurrence in the URL is left
        # untouched.
        video_links = [
            el.get_attribute("href").replace('www.', 'm.', 1)
            for el in driver.find_elements_by_css_selector('ul.fbStarGrid > li > a')
        ]

        for link in video_links:
            driver.get(link)

            # Captured before the play button is clicked.
            page_source = driver.page_source

            driver.find_element_by_css_selector('[data-sigil="m-video-play-button playInlineVideo"]').click() # play video
            cdn_url = driver.find_element_by_css_selector('video').get_attribute('src')

            # Last path segment of the video URL, without the query string.
            filename = cdn_url.split('?')[0].split('/')[-1]

            # Explicit UTF-8: the page may contain characters the locale's
            # default encoding cannot represent.
            with open('videos/{}.html'.format(filename), 'w', encoding='utf-8') as f:
                f.write(page_source)

            call(["curl", cdn_url, "--output", "videos/{}".format(filename)])
    finally:
        driver.quit()
	import time
	from selenium import webdriver
	from selenium.webdriver.chrome.options import Options

	# Identifier placed into the profile URLs the scripts open, e.g.
	# https://m.facebook.com/<FB_USER_ID>/photos. Empty by default; must be filled in.
	FB_USER_ID = '' # SET ME

	# Directory handed to Chrome as --user-data-dir by get_driver().
	# on mac, probably /Users/<mac username>/Library/Application Support/Google/Chrome/Default
	CHROME_PROFILE_PATH = ""


	def get_driver():
	    """Create a Chrome WebDriver that uses the configured Chrome profile.

	    Chrome is started with notifications and infobars disabled, audio muted
	    and the window maximized. ``CHROME_PROFILE_PATH`` is passed to Chrome as
	    ``--user-data-dir`` so the same profile is used on every run.

	    Returns:
	        The ``webdriver.Chrome`` instance. The caller owns it and is
	        responsible for closing it.
	    """
	    wd_options = Options()
	    for flag in (
	        "--disable-notifications",
	        "--disable-infobars",
	        "--mute-audio",
	        "--start-maximized",
	    ):
	        wd_options.add_argument(flag)
	    wd_options.add_argument("--user-data-dir={}".format(CHROME_PROFILE_PATH))

	    # ``chrome_options=`` is the deprecated spelling of this keyword and is
	    # rejected by newer Selenium releases; ``options=`` is the supported one.
	    return webdriver.Chrome(options=wd_options)


	def scroll_to_bottom(driver, pause=1):
	    """Scroll the current page down until its height stops growing.

	    The page is scrolled to ``document.body.scrollHeight`` repeatedly. After
	    each scroll the function waits ``pause`` seconds and re-reads the height;
	    it returns as soon as two consecutive readings are equal.

	    Args:
	        driver: Object exposing ``execute_script(js)`` (the scripts pass the
	            driver returned by ``get_driver``).
	        pause: Seconds to wait after each scroll before measuring the page
	            again. Defaults to 1, the value that used to be hard-coded.

	    Returns:
	        None.
	    """
	    last_height = driver.execute_script("return document.body.scrollHeight")

	    while True:
	        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

	        # Give the page time to append more content before measuring again.
	        time.sleep(pause)

	        new_height = driver.execute_script("return document.body.scrollHeight")
	        if new_height == last_height:
	            return
	        last_height = new_height
	import json, re

	from datetime import datetime, timezone
	from subprocess import call

	from helpers import scroll_to_bottom, get_driver, FB_USER_ID

	# you will likely need to update this to something that selects
	# for the container around the photo info, timestamp, album, etc
	# CSS selector; the script below looks up "<selector> abbr" for the timestamp
	# and "<selector> > div" for the info text on each photo page.
	CONTAINER_SELECTOR = ".atb"


	def get_fb_id(link):
	    """Return an identifier for a photo link, used to build its filename.

	    Args:
	        link: URL of the photo page, as a string.

	    Returns:
	        The digits of the ``fbid=`` parameter when the link contains one;
	        otherwise ``"fake_id_"`` followed by 16 hex characters derived from
	        the link itself.
	    """
	    import hashlib  # local import: only the fallback path needs it

	    match = re.search(r"fbid=([0-9]+)", link)
	    if match:
	        return match.group(1)

	    # The built-in hash() of a str is salted per interpreter process, so it
	    # gave the same link a different fallback id (and filename) on every run.
	    # A content digest is stable across runs.
	    digest = hashlib.sha256(link.encode("utf-8")).hexdigest()
	    return "fake_id_" + digest[:16]

	# Script entry point: visit every photo on the profile's photos page, record
	# its metadata in photos/data.json and download the full-size image with curl.
	if __name__ == '__main__':
	    print("-"*20 + "\nOpening Browser...")

	    driver = get_driver()
	    # Always shut the browser down, even when a lookup below raises;
	    # previously the driver was never closed.
	    try:
	        driver.get("https://m.facebook.com/{}/photos".format(FB_USER_ID))
	        scroll_to_bottom(driver)

	        # Hrefs are read up front, before the loop below navigates the driver
	        # to other pages.
	        photo_links = [
	            el.get_attribute("href")
	            for el in driver.find_elements_by_css_selector('.timeline.photos a')
	        ]

	        # Shared json.dumps formatting for console output and data.json.
	        pretty = dict(sort_keys=True, indent=4, separators=(',', ': '))

	        photos = []
	        for link in photo_links:
	            driver.get(link)

	            photo_id = get_fb_id(link)
	            full_size_url = driver.find_element_by_link_text("View Full Size").get_attribute("href")
	            actor = driver.find_element_by_css_selector('.actor').text
	            people = [
	                el.text
	                for el in driver.find_elements_by_css_selector('.tagName')
	            ]
	            caption = driver.find_element_by_css_selector('.msg > div').text
	            # The timestamp is read from the JSON held in the <abbr>'s
	            # data-store attribute, under the "time" key.
	            timestamp_json = driver.find_element_by_css_selector('{} abbr'.format(CONTAINER_SELECTOR)).get_attribute('data-store')
	            timestamp = json.loads(timestamp_json).get("time")
	            # Middle dots in the info text become "-", then trailing " -" is trimmed.
	            info = driver.find_element_by_css_selector('{} > div'.format(CONTAINER_SELECTOR)).text.replace('\u00b7', '-').rstrip(' -')
	            date = datetime.fromtimestamp(timestamp, timezone.utc).strftime("%Y-%m-%d")
	            filename = "{}_{}.jpg".format(date, photo_id)

	            # After loading the "View Full Size" link, driver.current_url is
	            # what gets stored as cdn_url and handed to curl.
	            driver.get(full_size_url)
	            photo = {
	                "fb_url": link,
	                "cdn_url": driver.current_url,
	                "actor": actor,
	                "caption": caption,
	                "timestamp": timestamp,
	                "info": info,
	                "filename": filename,
	                "people": people
	            }
	            print(json.dumps(photo, **pretty))
	            photos.append(photo)

	            # data.json is rewritten in full after every photo, so it always
	            # holds everything collected so far.
	            with open('photos/data.json', 'w') as f:
	                f.write(json.dumps(photos, **pretty))

	            call(["curl", driver.current_url, "--output", "photos/{}".format(filename)])
	    finally:
	        driver.quit()
	import re

	from subprocess import call

	from helpers import scroll_to_bottom, get_driver, FB_USER_ID

	# Script entry point: visit every video on the profile's videos page, save the
	# page HTML and download the video file with curl.
	if __name__ == '__main__':
	    print("-"*20 + "\nOpening Browser...")

	    driver = get_driver()
	    # Always shut the browser down, even when a lookup below raises;
	    # previously the driver was never closed.
	    try:
	        driver.get("https://www.facebook.com/{}/videos".format(FB_USER_ID))
	        scroll_to_bottom(driver)

	        # Each link is rewritten from the www. host to the m. host. Only the
	        # first "www." is replaced so a later occurrence in the URL is left
	        # untouched.
	        video_links = [
	            el.get_attribute("href").replace('www.', 'm.', 1)
	            for el in driver.find_elements_by_css_selector('ul.fbStarGrid > li > a')
	        ]

	        for link in video_links:
	            driver.get(link)

	            # Captured before the play button is clicked.
	            page_source = driver.page_source

	            driver.find_element_by_css_selector('[data-sigil="m-video-play-button playInlineVideo"]').click() # play video
	            cdn_url = driver.find_element_by_css_selector('video').get_attribute('src')

	            # Last path segment of the video URL, without the query string.
	            filename = cdn_url.split('?')[0].split('/')[-1]

	            # Explicit UTF-8: the page may contain characters the locale's
	            # default encoding cannot represent.
	            with open('videos/{}.html'.format(filename), 'w', encoding='utf-8') as f:
	                f.write(page_source)

	            call(["curl", cdn_url, "--output", "videos/{}".format(filename)])
	    finally:
	        driver.quit()