@nperony · Created November 23, 2016 13:24
Crawling profiles in a Facebook (or Workplace) community to get the list of followers
# Facebook login, needed to fetch the list of followers for every member
from selenium import webdriver
from selenium.webdriver.support import ui
from selenium.webdriver.common.keys import Keys

def page_is_loaded(driver):
    return driver.find_element_by_tag_name('body') is not None

driver = webdriver.Chrome()
driver.get('https://hyperloop.facebook.com/work/landing/input/')
wait = ui.WebDriverWait(driver, 10)
wait.until(page_is_loaded)

email_field = driver.find_element_by_name('email')
email_field.send_keys('myemailaddress@server.com')
email_field.send_keys(Keys.RETURN)
wait.until(page_is_loaded)
# Then complete the authentication manually (if 2FA is enabled)
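# A minimal sketch of blocking the script until the manual login/2FA step is
# done (assumes an interactive session; not strictly required if you run the
# cells one by one in a notebook):
input('Complete the login/2FA in the browser window, then press Enter...')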
### Helper functions

def get_follower_page_source(driver, member_id):
    # Load a member's followers page and return its HTML
    follower_url = 'https://hyperloop.facebook.com/profile.php?id=%s&sk=followers' % member_id
    driver.get(follower_url)
    wait.until(page_is_loaded)
    return driver.page_source
from bs4 import BeautifulSoup

def get_follower_ids(follower_page_html, pattern='hyperloop.facebook.com/profile.php?id=', id_length=15):
    # Yield the numeric profile IDs linked from a followers page
    soup = BeautifulSoup(follower_page_html, 'html.parser')
    followers = soup.find_all('li', {'class': 'fbProfileBrowserListItem'})
    for f in followers:
        name_block = f.find_next('div', {'class': 'clearfix _42ef'})
        profile_links = name_block.find_all('a')
        for l in profile_links:
            href = l.get('href')
            if href is None:
                continue
            if pattern in href:
                putative_id = href.split(pattern, 1)[1][:id_length]
                if putative_id.isdigit() and len(putative_id) == id_length:
                    yield putative_id
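# The crawl loop below expects ids_members, a list of member profile IDs.
# A minimal sketch, assuming it comes from the members.csv produced by the
# social graph crawler (the same file the bonus section reads):
import pandas as pd
ids_members = pd.read_csv('members.csv')['id'].tolist()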
# Create the followers file (a list of directed edges)
import datetime
import tqdm

follower_dicts = []
for member_id in tqdm.tqdm_notebook(ids_members):
    source = get_follower_page_source(driver, member_id)
    follower_ids = list(get_follower_ids(source))
    follower_dicts.extend([{'id_followee': member_id, 'id_follower': fid} for fid in follower_ids])
print('%d follower relationships' % len(follower_dicts))

savefile = datetime.datetime.now().strftime('followers_%Y-%m-%d_%H-%M-%S.csv')
print('Creating DataFrame and saving CSV to %s' % savefile)
dff = pd.DataFrame(follower_dicts)
dff = dff[['id_followee', 'id_follower']]
dff.to_csv('data_dumps/' + savefile, index=False, encoding='utf-8')
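# Optional: the saved edge list loads straight into a directed graph for
# analysis. A sketch, assuming networkx (>= 2.0) is installed:
import networkx as nx
g = nx.from_pandas_edgelist(dff, source='id_follower', target='id_followee', create_using=nx.DiGraph())
print('%d nodes, %d edges' % (g.number_of_nodes(), g.number_of_edges()))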
# Bonus: create a followers file with names instead of IDs
import pandas as pd

dfm = pd.read_csv('members.csv')        # created by the social graph crawler
dff = pd.read_csv('followers_xxx.csv')  # created at the step above

def get_name(dfm, member_id):
    # Look up a member's full name by ID, with fallbacks for missing fields
    idx = dfm[dfm['id'] == member_id].index.tolist()
    if len(idx) == 0:
        return 'Name not found'
    elif len(idx) > 1:
        return 'Duplicate ID %d' % member_id
    else:
        idx = idx[0]
        fn = dfm.loc[idx, 'first_name'] if pd.notnull(dfm.loc[idx, 'first_name']) else 'FIRST_NAME'
        ln = dfm.loc[idx, 'last_name'] if pd.notnull(dfm.loc[idx, 'last_name']) else 'LAST_NAME'
        return '%s %s' % (fn, ln)
dffn = pd.DataFrame()
dffn['follower'] = dff['id_follower'].map(lambda x: get_name(dfm, x))
dffn['followee'] = dff['id_followee'].map(lambda x: get_name(dfm, x))
dffn = dffn.drop_duplicates()  # some people have created several accounts

import csv
savefile = datetime.datetime.now().strftime('followers_withnames_%Y-%m-%d_%H-%M-%S.csv')
dffn.to_csv(savefile, index=False, encoding='utf-8', quoting=csv.QUOTE_NONNUMERIC, header=False)