Last active
August 17, 2016 17:51
-
-
Save loganwilliams/a6ff8d035f0b57321acb1d446f0fcf99 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from selenium import webdriver | |
import json | |
import time | |
import datetime | |
import sys | |
from wand.image import Image | |
import wand.exceptions | |
import requests | |
from StringIO import StringIO | |
from pymongo import MongoClient | |
basepath = '/Volumes/Manganese/facebook-images/' | |
username = 'williams.logan' | |
with open('cookies.json') as data_file: | |
cookies = json.load(data_file) | |
def scrollToBottom(driver): | |
attempts = 0 | |
lastheight = 0 | |
while (attempts < 15): | |
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") | |
sys.stdout.write('.') | |
sleep(0.1) | |
height = driver.execute_script("return(document.body.scrollHeight);") | |
if (height == lastheight): | |
attempts += 1 | |
else: | |
attempts = 0 | |
lastheight = height | |
print('o') | |
def getUserIds(driver): | |
driver.get('https://m.facebook.com/' + username + '/friends/'); | |
scrollToBottom(driver) | |
user_ids = set() | |
user_elements = driver.find_elements_by_css_selector('._52jh a') | |
for user_element in user_elements: | |
user_link = user_element.get_attribute('href') | |
if user_link is not None: | |
user = user_link[23:].split('/')[0] | |
if (user is not 'settings') and (user is not 'findfriends'): | |
user_ids.add(user) | |
return list(user_ids) | |
def getPhotoIds(driver): | |
links = driver.find_elements_by_tag_name('a') | |
photo_ids = set() | |
for link in links: | |
if link.get_attribute('href') is not None: | |
if 'facebook.com/photo.php' in link.get_attribute('href'): | |
photo_ids.add(link.get_attribute('href').split('fbid=')[1].split('&')[0]) | |
return photo_ids | |
def getPhotoInfo(driver, photo_id): | |
photo_info = {} | |
print('https://m.facebook.com/photo.php?fbid=' + str(photo_id)) | |
driver.get('https://m.facebook.com/photo.php?fbid=' + str(photo_id)); | |
driver.save_screenshot('test.png') | |
photo_info['url'] = driver.find_element_by_link_text('View Full Size').get_attribute('href') | |
try: | |
likes = driver.find_element_by_css_selector('._1g06') | |
likes_text = likes.text | |
likes_split = likes_text.split('and ') | |
if (len(likes_split) > 1): | |
subsplit = likes_split[1].split(' others') | |
if len(subsplit) > 1: | |
num_likes = int(subsplit[0]) + len(likes_split[0].split(',')) | |
else: | |
num_likes = 2 | |
else: | |
try: | |
num_likes = int(likes_split[0]) | |
except ValueError: | |
num_likes = 1 | |
except: | |
num_likes = 0 | |
photo_info['likes'] = num_likes | |
abbr = driver.find_element_by_css_selector('._2vja abbr') | |
photo_info['timestamp'] = datetime.datetime.fromtimestamp(json.loads(abbr.get_attribute('data-store'))['time']) | |
photo_info['user'] = driver.find_element_by_css_selector('.actor-link').get_attribute('href')[23:].split('?')[0] | |
print(photo_info) | |
return photo_info | |
user_ids = getUserIds(driver) | |
images = set() | |
for year in ['2016', '2015', '2014', '2013']: | |
for user_id in user_ids: | |
driver = webdriver.PhantomJS() | |
for cookie in cookies: | |
driver.add_cookie(cookie) | |
print("Loading page for user '" + user_id + "', year " + year) | |
driver.get('https://m.facebook.com/' + user_id + '/year/' + year + '/') | |
scrollToBottom(driver) | |
photos = getPhotoIds(driver) | |
images.update(photos) | |
print(" " + str(len(photos)) + " photos added to " + str(len(images)) + " so far") | |
driver.quit() | |
photos_list = list(all_photos) | |
client = MongoClient('127.0.0.1', 3001) | |
db = client.meteor | |
i = 0 | |
finished = [] | |
for photo_id in photos_list[:]: | |
print("photo " + str(i) + "/" + str(len(all_photos))) | |
if (i % 100) == 0: | |
driver = webdriver.PhantomJS() | |
for cookie in cookies: | |
driver.add_cookie(cookie) | |
photo_info = getPhotoInfo(driver, photo_id) | |
if (i % 100) == 99: | |
driver.quit() | |
if photo_info: | |
result = db.facebook.insert_one(photo_info) | |
finished.append(photo_id) | |
i += 1 | |
expired_urls = [] | |
images = list(images) | |
for i in range(len(images)): | |
print(str(i) + "/" + str(len(images))) | |
retries = 0 | |
if 'uri' in images[i].keys(): | |
print(' already done') | |
else: | |
while retries < 2: | |
try: | |
response = requests.get(images[i]['url']) | |
with Image(file=StringIO(response.content)) as img: | |
img.auto_orient() | |
uris = {} | |
(w,h) = img.size | |
w = int(w) | |
h = int(h) | |
fname = basepath + images[i]['id'] + '.jpg' | |
uris['original'] = fname | |
img.save(filename=fname) | |
images[i]['width'] = w | |
images[i]['height'] = h | |
with img.clone() as img_clone: | |
if w > h: | |
new_width = int(((299.0)/h)*w) | |
img_clone.resize(new_width, 299) | |
img_clone.crop(int((new_width-299.0)/2), 0, width=299, height=299) | |
else: | |
new_height = int(((299.0)/w)*h) | |
img_clone.resize(299, new_height) | |
img_clone.crop(0, int((new_height-299.0)/2), width=299, height=299) | |
fname = basepath + images[i]['id'] + '_299' + '.png' | |
uris['299'] = fname | |
img_clone.save(filename=fname) | |
with img.clone() as img_clone: | |
if w > h: | |
new_width = int(((256.0)/h)*w) | |
img_clone.resize(new_width, 256) | |
img_clone.crop(int((new_width-256.0)/2), 0, width=256, height=256) | |
else: | |
new_height = int(((256.0)/w)*h) | |
img_clone.resize(256, new_height) | |
img_clone.crop(0, int((new_height-256.0)/2), width=256, height=256) | |
fname = basepath + images[i]['id'] + '_256' + '.png' | |
uris['256'] = fname | |
img_clone.save(filename=fname) | |
images[i]['uri'] = uris | |
db.facebook.update_one({'_id': images[i]['_id']}, {'$set': images[i]}) | |
break | |
except wand.exceptions.MissingDelegateError: | |
print("Missing delegate error -- possible expired URL") | |
expired_urls.append(images[i]['id']) | |
print(" " + str(len(expired_urls)) + " found expired") | |
time.sleep(0.1) | |
break | |
except requests.exceptions.SSLError: | |
print("SSL error, retry " + str(retries)) | |
time.sleep(0.1) | |
retries += 1 | |
except requests.exceptions.ConnectionError: | |
print("Connections error, retry " + str(retries)) | |
time.sleep(1) | |
retries += 1 | |
except: | |
print("Unknown error, retry " + str(retries)) | |
time.sleep(1) | |
retries += 1 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment