Created
February 20, 2019 09:49
-
-
Save arno12/ba36b0e7e0663714f9396bf52cdfce08 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import facebook | |
import requests | |
import json | |
import datetime | |
import time | |
def unicode_normalize(text):
    """Replace common "smart" punctuation with ASCII equivalents and strip
    any remaining non-ASCII characters.

    Fix: the original returned bytes (it stopped at ``str.encode``), which
    makes the Python 3 csv writer emit ``b'...'`` literals into the output
    file; decode back to str so a plain string is written.
    """
    translated = text.translate({
        0x2018: 0x27,  # left single quote  -> '
        0x2019: 0x27,  # right single quote -> '
        0x201C: 0x22,  # left double quote  -> "
        0x201D: 0x22,  # right double quote -> "
        0xA0: 0x20,    # non-breaking space -> ' '
    })
    return translated.encode('ascii', 'ignore').decode('ascii')
def request_until_succeed(url):
    """GET *url* repeatedly until a 200 response arrives, then return the
    parsed JSON body.

    Fix: the original issued the request once *before* the loop, so on any
    non-200 response it spun forever without retrying (and since checking
    ``status_code`` cannot raise, the sleep/log path in the ``except`` was
    dead code). The request now lives inside the loop.
    """
    while True:
        try:
            req = requests.get(url)
            if req.status_code == 200:
                return req.json()
        except Exception as e:
            # Transport-level failure (DNS, timeout, connection reset).
            print(e)
        # Non-200 or transport error: log with a timestamp, wait, retry.
        time.sleep(5)
        print("Error for URL {}: {}".format(url, datetime.datetime.now()))
def getFacebookPageFeedData(page_id, access_token, num_statuses):
    """Fetch one page of posts for *page_id* from the Graph API v2.6 feed,
    with summary counts for comments, shares, and reactions.

    See http://stackoverflow.com/a/37239851 for the Reactions parameters.
    """
    url = (
        "https://graph.facebook.com/v2.6"
        + "/{}/posts".format(page_id)
        + "/?fields=message,link,created_time,type,name,id,"
          "comments.limit(0).summary(true),shares,"
          "reactions.limit(0).summary(true)"
        + "&limit={}&access_token={}".format(num_statuses, access_token)
    )
    return request_until_succeed(url)
def get_comments_count(post_id, ACCESS_TOKEN):
    """Return the total comment count for *post_id* via the Graph API.

    Fix: the original URL was missing the '/' between the API version and
    the post id (producing "...v2.6<post_id>/comments..."), which is an
    invalid endpoint; compare the correctly-built URLs in get_page_likes.
    """
    # Build the Graph API call for the comments edge with a summary.
    comments_url = ("https://graph.facebook.com/v2.6/" + post_id +
                    "/comments?summary=true&key=value&access_token=" + ACCESS_TOKEN)
    comments_json = requests.get(comments_url).json()
    # total_count lives under the summary object requested above.
    return comments_json["summary"]["total_count"]
def get_shares_count(post_id, ACCESS_TOKEN):
    """Return the summary total_count from the /likes edge for *post_id*.

    NOTE(review): despite its name this queries the /likes edge, not
    shares — preserved as-is since callers may depend on it; rename with
    care. Fix: the original URL was missing the '/' between the API
    version and the post id, producing an invalid endpoint.
    """
    shares_url = ("https://graph.facebook.com/v2.6/" + post_id +
                  "/likes?summary=true&key=value&access_token=" + ACCESS_TOKEN)
    shares_json = requests.get(shares_url).json()
    return shares_json["summary"]["total_count"]
def get_page_likes(post_id, ACCESS_TOKEN):
    """Fetch the ``talking_about_count`` field for *post_id*.

    NOTE(review): the name says "likes" but the field requested is
    talking_about_count, while get_page_TAT fetches engagement — the two
    names look swapped; confirm against callers before renaming either.
    """
    url = ("https://graph.facebook.com/v2.6/" + post_id +
           "?fields=talking_about_count&access_token=" + ACCESS_TOKEN)
    payload = requests.get(url).json()
    return payload['talking_about_count']
def get_page_TAT(post_id, ACCESS_TOKEN):
    """Fetch the ``engagement.count`` field for *post_id*.

    NOTE(review): the name says "TAT" (talking-about) but the field
    requested is engagement — appears swapped with get_page_likes; verify
    against callers before renaming either.
    """
    url = ("https://graph.facebook.com/v2.6/" + post_id +
           "?fields=engagement&access_token=" + ACCESS_TOKEN)
    payload = requests.get(url).json()
    return payload['engagement']['count']
def getReactionsForStatus(status_id, access_token):
    """Fetch per-type reaction summaries (like/love/wow/haha/sad/angry)
    for a single status.

    Reactions are only accessible at a single-post endpoint; see
    http://stackoverflow.com/a/37239851 for the Reactions parameters.
    """
    # One aliased summary field per reaction type, joined into the query.
    reaction_fields = ",".join(
        "reactions.type({}).limit(0).summary(total_count).as({})".format(
            kind.upper(), kind)
        for kind in ("like", "love", "wow", "haha", "sad", "angry")
    )
    url = ("https://graph.facebook.com/v2.6/" + status_id +
           "/?fields=" + reaction_fields +
           "&access_token={}".format(access_token))
    return request_until_succeed(url)
def processFacebookPageFeedStatus(status, access_token, company):
    """Flatten one Graph API status dict into the 16-tuple written to the
    CSV (matching the header in scrapeFacebookPageFeedStatus).

    Optional fields (message, name, link) default to '' and missing counts
    default to 0. Fix: the bare ``except:`` around the comment count is
    narrowed to the key/shape errors it was guarding against, so real
    errors (e.g. KeyboardInterrupt) are no longer swallowed.
    """
    status_id = status['id']
    # Optional top-level fields: default to '' when absent.
    status_message = unicode_normalize(status['message']) if 'message' in status else ''
    link_name = unicode_normalize(status['name']) if 'name' in status else ''
    status_type = status['type']
    status_link = unicode_normalize(status['link']) if 'link' in status else ''

    # Time needs special care since a) it's in UTC and b) the raw ISO form
    # is not easy to use in statistical programs.
    status_published = datetime.datetime.strptime(
        status['created_time'], '%Y-%m-%dT%H:%M:%S+0000')
    status_published = status_published.strftime('%Y-%m-%d %H:%M:%S')

    # Nested counts; a missing key (or missing summary) means zero.
    num_reactions = status.get('reactions', {}).get('summary', {}).get('total_count', 0)
    try:
        num_comments = 0 if 'comments' not in status else status['comments']['summary']['total_count']
    except (KeyError, TypeError):  # fix: was a bare except
        num_comments = False
    num_shares = status.get('shares', {}).get('count', 0)

    # Counts of each reaction separately (good for sentiment). Only query
    # reactions for statuses after the feature launched:
    # http://newsroom.fb.com/news/2016/02/reactions-now-available-globally/
    reactions = getReactionsForStatus(status_id, access_token) if status_published > '2016-02-24 00:00:00' else {}

    def _reaction_count(kind):
        # summary.total_count for one aliased reaction type, 0 if absent.
        return reactions.get(kind, {}).get('summary', {}).get('total_count', 0)

    return (status_id, status_message, link_name, status_type, status_link,
            status_published, num_reactions, num_comments, num_shares,
            _reaction_count('like'), _reaction_count('love'),
            _reaction_count('wow'), _reaction_count('haha'),
            _reaction_count('sad'), _reaction_count('angry'), company)
def scrapeFacebookPageFeedStatus(page_id, access_token):
    """Scrape statuses for every page in *page_id* (a list of page names)
    into results/benchmark_facebook_statuses_<date>.csv.

    Only statuses newer than the module-level ``old_date`` cutoff are
    written; pagination stops when a feed is exhausted or the cutoff is
    passed. Relies on module-level ``date`` and ``old_date``.
    """
    with open('results/benchmark_facebook_statuses_{}.csv'.format(date), 'w') as file:
        w = csv.writer(file)
        w.writerow(["status_id", "status_message", "link_name", "status_type", "status_link",
                    "status_published", "num_reactions", "num_comments", "num_shares", "num_likes",
                    "num_loves", "num_wows", "num_hahas", "num_sads", "num_angrys", "company_name"])
        num_processed = 0  # running count across all companies, for progress output
        for company in page_id:
            has_next_page = True
            scrape_starttime = datetime.datetime.now()
            print("Scraping {} Facebook Page: {}\n".format(company, scrape_starttime))
            statuses = getFacebookPageFeedData(company, access_token, 100)
            # Default so the paging check below is defined even for an empty feed.
            processed_date = str(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
            while has_next_page:  # fix: was "== True"
                for status in statuses['data']:
                    processed_date = str(datetime.datetime.strptime(status['created_time'], '%Y-%m-%dT%H:%M:%S+0000'))
                    if processed_date > old_date:
                        w.writerow(processFacebookPageFeedStatus(status, access_token, company))
                        # output progress occasionally to make sure code is not stalling
                        num_processed += 1
                        if num_processed % 100 == 0:
                            print("{} Statuses Processed from {}: {}".format(num_processed, company, datetime.datetime.now()))
                # Follow pagination only while still above the date cutoff.
                # fix: guard on 'next' specifically — a final page can carry a
                # 'paging' object with only 'previous', which raised KeyError.
                next_url = statuses.get('paging', {}).get('next')
                if next_url and processed_date > old_date:
                    statuses = request_until_succeed(next_url)
                else:
                    has_next_page = False
        print("\nDone!\n{} Statuses Processed in {}\n".format(num_processed, datetime.datetime.now() - scrape_starttime))
# --- configuration --------------------------------------------------------
app_id = ''      # Facebook app id (fill in before running)
app_secret = ''  # Facebook app secret (fill in before running)

# Pages to benchmark against each other.
page_id = ['Bookingcom', 'Expedia', 'TripAdvisor', 'Airbnb', 'Kayak', 'Hotels.comUS']

# An app access token is simply "<app_id>|<app_secret>".
access_token = "{}|{}".format(app_id, app_secret)

# Today's date stamps the output file name; old_date is the 90-day cutoff
# below which statuses are not written.
date = datetime.datetime.now().strftime('%Y-%m-%d')
old_date = (datetime.datetime.now() - datetime.timedelta(days=90)).strftime('%Y-%m-%d %H:%M:%S')
# Script entry point: scrape all configured pages using the app token.
if __name__ == '__main__':
    scrapeFacebookPageFeedStatus(page_id, access_token)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment