Skip to content

Instantly share code, notes, and snippets.

@arno12
Created February 20, 2019 09:49
Show Gist options
  • Save arno12/ba36b0e7e0663714f9396bf52cdfce08 to your computer and use it in GitHub Desktop.
import csv
import facebook
import requests
import json
import datetime
import time
def unicode_normalize(text):
    """Normalize curly punctuation to ASCII and drop remaining non-ASCII.

    Maps curly single/double quotes to ' and ", and NBSP to a plain space,
    then strips any character still outside ASCII.

    Fix: the original returned ``bytes`` (the result of ``.encode``), which
    ``csv.writer`` renders as ``b'...'`` under Python 3; decode back so the
    function returns ``str``.
    """
    translated = text.translate({
        0x2018: 0x27,  # left single quote  -> '
        0x2019: 0x27,  # right single quote -> '
        0x201C: 0x22,  # left double quote  -> "
        0x201D: 0x22,  # right double quote -> "
        0xA0:   0x20,  # non-breaking space -> ' '
    })
    return translated.encode('ascii', 'ignore').decode('ascii')
def request_until_succeed(url):
    """GET *url* until it returns HTTP 200, then return the parsed JSON body.

    Retries every 5 seconds. Fix: the original issued the request once,
    *before* the loop — a non-200 response made ``while success is False``
    spin forever without ever re-requesting or sleeping.
    """
    while True:
        try:
            req = requests.get(url)  # re-issue the request on every attempt
            if req.status_code == 200:
                return req.json()
            print("Error for URL {}: {}".format(url, datetime.datetime.now()))
        except Exception as e:
            # network-level failure (connection error, timeout, ...)
            print(e)
            print("Error for URL {}: {}".format(url, datetime.datetime.now()))
        time.sleep(5)
def getFacebookPageFeedData(page_id, access_token, num_statuses):
    """Fetch the latest posts of a page with summary engagement counts.

    See http://stackoverflow.com/a/37239851 for the Reactions parameters.
    Returns the parsed JSON feed (a dict with 'data' and 'paging' keys).
    """
    endpoint = "https://graph.facebook.com/v2.6" + "/{}/posts".format(page_id)
    query = (
        "/?fields=message,link,created_time,type,name,id,"
        "comments.limit(0).summary(true),shares,"
        "reactions.limit(0).summary(true)"
        + "&limit={}&access_token={}".format(num_statuses, access_token)
    )
    return request_until_succeed(endpoint + query)
def get_comments_count(post_id, ACCESS_TOKEN):
    """Return the total comment count for *post_id* via the /comments edge.

    Fix: the original URL was missing the '/' between the API version and
    the post id (".../v2.6<post_id>/comments"), which can never resolve —
    compare the correctly-formed URLs in get_page_likes/get_page_TAT.
    """
    # Graph API call: summary=true adds a summary.total_count field
    comments_url = "https://graph.facebook.com/v2.6/" + post_id + "/comments?summary=true&key=value&access_token=" + ACCESS_TOKEN
    comments_json = requests.get(comments_url).json()
    # pick out the total count from the summary object
    return comments_json["summary"]["total_count"]
def get_shares_count(post_id, ACCESS_TOKEN):
    """Return summary.total_count from the post's /likes edge.

    NOTE(review): despite the name, this queries the /likes edge, not
    shares — confirm the intended metric with callers before renaming.
    Fix: the original URL was missing the '/' between the API version and
    the post id (".../v2.6<post_id>/likes"), which can never resolve.
    """
    shares_url = "https://graph.facebook.com/v2.6/" + post_id + "/likes?summary=true&key=value&access_token=" + ACCESS_TOKEN
    shares_json = requests.get(shares_url).json()
    return shares_json["summary"]["total_count"]
def get_page_likes(post_id, ACCESS_TOKEN):
    """Return the page's talking_about_count.

    NOTE(review): despite its name, this fetches talking_about_count while
    get_page_TAT fetches engagement — the two names look swapped; confirm
    with callers before renaming either.
    """
    url = ("https://graph.facebook.com/v2.6/" + post_id
           + "?fields=talking_about_count&access_token=" + ACCESS_TOKEN)
    payload = requests.get(url).json()
    return payload['talking_about_count']
def get_page_TAT(post_id, ACCESS_TOKEN):
    """Return the page's total engagement count.

    NOTE(review): despite its name, this fetches the engagement field while
    get_page_likes fetches talking_about_count — the two names look
    swapped; confirm with callers before renaming either.
    """
    url = ("https://graph.facebook.com/v2.6/" + post_id
           + "?fields=engagement&access_token=" + ACCESS_TOKEN)
    payload = requests.get(url).json()
    return payload['engagement']['count']
def getReactionsForStatus(status_id, access_token):
    """Fetch per-type reaction totals for a single status.

    Reactions are only accessible at the single-post endpoint; see
    http://stackoverflow.com/a/37239851 for the parameter format. Returns
    the parsed JSON with one key per reaction type ('like', 'love', ...).
    """
    reaction_types = ('like', 'love', 'wow', 'haha', 'sad', 'angry')
    # Build one aliased summary sub-request per reaction type.
    fields = ",".join(
        "reactions.type({}).limit(0).summary(total_count).as({})".format(
            kind.upper(), kind)
        for kind in reaction_types
    )
    url = ("https://graph.facebook.com/v2.6/" + status_id
           + "/?fields=" + fields
           + "&access_token={}".format(access_token))
    return request_until_succeed(url)
def processFacebookPageFeedStatus(status, access_token, company):
    """Flatten one Graph API post dict into a CSV-ready 16-tuple.

    Args:
        status: one post from the /posts feed. Must contain 'id', 'type'
            and 'created_time'; 'message', 'name', 'link', 'reactions',
            'comments' and 'shares' are optional.
        access_token: Graph API token, used for the per-post reactions call.
        company: page name, appended verbatim as the last column.

    Returns:
        Tuple matching the header row written by scrapeFacebookPageFeedStatus.

    Fix: the original bare ``except:`` set num_comments to ``False``
    (a non-numeric CSV cell) and swallowed every exception; narrowed to
    the lookup errors that can occur and defaulted to 0.
    """
    status_id = status['id']
    status_type = status['type']
    # Optional top-level fields default to '' when absent.
    status_message = unicode_normalize(status['message']) if 'message' in status else ''
    link_name = unicode_normalize(status['name']) if 'name' in status else ''
    status_link = unicode_normalize(status['link']) if 'link' in status else ''

    # Time needs special care: a) it's in UTC and b) the raw ISO form is
    # awkward in statistical programs, so reformat for spreadsheets.
    status_published = datetime.datetime.strptime(
        status['created_time'], '%Y-%m-%dT%H:%M:%S+0000'
    ).strftime('%Y-%m-%d %H:%M:%S')

    # Nested counters: an absent key means zero engagement.
    num_reactions = status['reactions']['summary']['total_count'] if 'reactions' in status else 0
    try:
        num_comments = status['comments']['summary']['total_count'] if 'comments' in status else 0
    except (KeyError, TypeError):
        # malformed 'comments' payload — count it as zero
        num_comments = 0
    num_shares = status['shares']['count'] if 'shares' in status else 0

    # Per-reaction counts (useful for sentiment). Only query reactions for
    # posts published after their global launch:
    # http://newsroom.fb.com/news/2016/02/reactions-now-available-globally/
    reactions = getReactionsForStatus(status_id, access_token) if status_published > '2016-02-24 00:00:00' else {}

    def _reaction_total(kind):
        # each aliased entry mirrors the summary shape of the feed call
        return reactions[kind]['summary']['total_count'] if kind in reactions else 0

    return (status_id, status_message, link_name, status_type, status_link,
            status_published, num_reactions, num_comments, num_shares,
            _reaction_total('like'), _reaction_total('love'),
            _reaction_total('wow'), _reaction_total('haha'),
            _reaction_total('sad'), _reaction_total('angry'), company)
def scrapeFacebookPageFeedStatus(page_id, access_token):
    """Scrape recent statuses of each page in *page_id* into one CSV file.

    Writes results/benchmark_facebook_statuses_<date>.csv (uses the
    module-level ``date`` and ``old_date`` globals) and pages through each
    feed until posts fall outside the ``old_date`` cutoff.

    Fix: ``open(..., 'w')`` now passes ``newline=''`` as the csv module
    requires (otherwise blank rows appear on Windows).
    """
    with open('results/benchmark_facebook_statuses_{}.csv'.format(date),
              'w', newline='') as file:
        w = csv.writer(file)
        w.writerow(["status_id", "status_message", "link_name", "status_type", "status_link",
                    "status_published", "num_reactions", "num_comments", "num_shares", "num_likes",
                    "num_loves", "num_wows", "num_hahas", "num_sads", "num_angrys", "company_name"])
        num_processed = 0  # keep a count on how many we've processed
        for company in page_id:
            has_next_page = True
            scrape_starttime = datetime.datetime.now()
            print("Scraping {} Facebook Page: {}\n".format(company, scrape_starttime))
            statuses = getFacebookPageFeedData(company, access_token, 100)
            # default the cutoff variable in case the feed page is empty
            processed_date = str(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
            while has_next_page:
                for status in statuses['data']:
                    processed_date = str(datetime.datetime.strptime(
                        status['created_time'], '%Y-%m-%dT%H:%M:%S+0000'))
                    # only keep posts newer than the old_date cutoff
                    if processed_date > old_date:
                        w.writerow(processFacebookPageFeedStatus(status, access_token, company))
                        # output progress occasionally to make sure code is not stalling
                        num_processed += 1
                        if num_processed % 100 == 0:
                            print("{} Statuses Processed from {}: {}".format(
                                num_processed, company, datetime.datetime.now()))
                # if there is no next page (or we passed the cutoff), we're done
                if 'paging' in statuses and processed_date > old_date:
                    statuses = request_until_succeed(statuses['paging']['next'])
                else:
                    has_next_page = False
        print("\nDone!\n{} Statuses Processed in {}\n".format(
            num_processed, datetime.datetime.now() - scrape_starttime))
# --- Configuration -------------------------------------------------------
# App credentials: an app access token is "<app_id>|<app_secret>".
app_id = ''
app_secret = ''
# Pages to benchmark against each other.
page_id = ['Bookingcom', 'Expedia', 'TripAdvisor', 'Airbnb', 'Kayak', 'Hotels.comUS']
access_token = "{}|{}".format(app_id, app_secret)
# Today's date (CSV filename suffix) and the 90-day lookback cutoff.
date = datetime.datetime.now().strftime('%Y-%m-%d')
old_date = (datetime.datetime.now()
            - datetime.timedelta(days=90)).strftime('%Y-%m-%d %H:%M:%S')

if __name__ == '__main__':
    scrapeFacebookPageFeedStatus(page_id, access_token)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment