Skip to content

Instantly share code, notes, and snippets.

@arno12
Created February 20, 2019 09:49
Show Gist options
  • Save arno12/ba36b0e7e0663714f9396bf52cdfce08 to your computer and use it in GitHub Desktop.
import csv
import facebook
import requests
import json
import datetime
import time
def unicode_normalize(text):
    """Normalize curly punctuation to ASCII and drop remaining non-ASCII.

    Maps curly single/double quotes to ' and ", and NBSP to a plain space,
    then strips any character still outside ASCII.

    Fix: the original returned ``bytes`` (the result of ``.encode``), which
    ``csv.writer`` renders as ``b'...'`` under Python 3; decode back so the
    function returns ``str``.
    """
    translated = text.translate({
        0x2018: 0x27,  # left single quote  -> '
        0x2019: 0x27,  # right single quote -> '
        0x201C: 0x22,  # left double quote  -> "
        0x201D: 0x22,  # right double quote -> "
        0xA0:   0x20,  # non-breaking space -> ' '
    })
    return translated.encode('ascii', 'ignore').decode('ascii')
def request_until_succeed(url):
    """GET *url* until it returns HTTP 200, then return the parsed JSON body.

    Retries every 5 seconds. Fix: the original issued the request once,
    *before* the loop — a non-200 response made ``while success is False``
    spin forever without ever re-requesting or sleeping.
    """
    while True:
        try:
            req = requests.get(url)  # re-issue the request on every attempt
            if req.status_code == 200:
                return req.json()
            print("Error for URL {}: {}".format(url, datetime.datetime.now()))
        except Exception as e:
            # network-level failure (connection error, timeout, ...)
            print(e)
            print("Error for URL {}: {}".format(url, datetime.datetime.now()))
        time.sleep(5)
def getFacebookPageFeedData(page_id, access_token, num_statuses):
    """Fetch the latest posts of a page with summary engagement counts.

    See http://stackoverflow.com/a/37239851 for the Reactions parameters.
    Returns the parsed JSON feed (a dict with 'data' and 'paging' keys).
    """
    endpoint = "https://graph.facebook.com/v2.6" + "/{}/posts".format(page_id)
    query = (
        "/?fields=message,link,created_time,type,name,id,"
        "comments.limit(0).summary(true),shares,"
        "reactions.limit(0).summary(true)"
        + "&limit={}&access_token={}".format(num_statuses, access_token)
    )
    return request_until_succeed(endpoint + query)
def get_comments_count(post_id, ACCESS_TOKEN):
    """Return the total comment count for *post_id* via the /comments edge.

    Fix: the original URL was missing the '/' between the API version and
    the post id (".../v2.6<post_id>/comments"), which can never resolve —
    compare the correctly-formed URLs in get_page_likes/get_page_TAT.
    """
    # Graph API call: summary=true adds a summary.total_count field
    comments_url = "https://graph.facebook.com/v2.6/" + post_id + "/comments?summary=true&key=value&access_token=" + ACCESS_TOKEN
    comments_json = requests.get(comments_url).json()
    # pick out the total count from the summary object
    return comments_json["summary"]["total_count"]
def get_shares_count(post_id, ACCESS_TOKEN):
    """Return summary.total_count from the post's /likes edge.

    NOTE(review): despite the name, this queries the /likes edge, not
    shares — confirm the intended metric with callers before renaming.
    Fix: the original URL was missing the '/' between the API version and
    the post id (".../v2.6<post_id>/likes"), which can never resolve.
    """
    shares_url = "https://graph.facebook.com/v2.6/" + post_id + "/likes?summary=true&key=value&access_token=" + ACCESS_TOKEN
    shares_json = requests.get(shares_url).json()
    return shares_json["summary"]["total_count"]
def get_page_likes(post_id, ACCESS_TOKEN):
    """Return the page's talking_about_count.

    NOTE(review): despite its name, this fetches talking_about_count while
    get_page_TAT fetches engagement — the two names look swapped; confirm
    with callers before renaming either.
    """
    url = ("https://graph.facebook.com/v2.6/" + post_id
           + "?fields=talking_about_count&access_token=" + ACCESS_TOKEN)
    payload = requests.get(url).json()
    return payload['talking_about_count']
def get_page_TAT(post_id, ACCESS_TOKEN):
    """Return the page's total engagement count.

    NOTE(review): despite its name, this fetches the engagement field while
    get_page_likes fetches talking_about_count — the two names look
    swapped; confirm with callers before renaming either.
    """
    url = ("https://graph.facebook.com/v2.6/" + post_id
           + "?fields=engagement&access_token=" + ACCESS_TOKEN)
    payload = requests.get(url).json()
    return payload['engagement']['count']
def getReactionsForStatus(status_id, access_token):
    """Fetch per-type reaction totals for a single status.

    Reactions are only accessible at the single-post endpoint; see
    http://stackoverflow.com/a/37239851 for the parameter format. Returns
    the parsed JSON with one key per reaction type ('like', 'love', ...).
    """
    reaction_types = ('like', 'love', 'wow', 'haha', 'sad', 'angry')
    # Build one aliased summary sub-request per reaction type.
    fields = ",".join(
        "reactions.type({}).limit(0).summary(total_count).as({})".format(
            kind.upper(), kind)
        for kind in reaction_types
    )
    url = ("https://graph.facebook.com/v2.6/" + status_id
           + "/?fields=" + fields
           + "&access_token={}".format(access_token))
    return request_until_succeed(url)
def processFacebookPageFeedStatus(status, access_token, company):
    """Flatten one Graph API post dict into a CSV-ready 16-tuple.

    Args:
        status: one post from the /posts feed. Must contain 'id', 'type'
            and 'created_time'; 'message', 'name', 'link', 'reactions',
            'comments' and 'shares' are optional.
        access_token: Graph API token, used for the per-post reactions call.
        company: page name, appended verbatim as the last column.

    Returns:
        Tuple matching the header row written by scrapeFacebookPageFeedStatus.

    Fix: the original bare ``except:`` set num_comments to ``False``
    (a non-numeric CSV cell) and swallowed every exception; narrowed to
    the lookup errors that can occur and defaulted to 0.
    """
    status_id = status['id']
    status_type = status['type']
    # Optional top-level fields default to '' when absent.
    status_message = unicode_normalize(status['message']) if 'message' in status else ''
    link_name = unicode_normalize(status['name']) if 'name' in status else ''
    status_link = unicode_normalize(status['link']) if 'link' in status else ''

    # Time needs special care: a) it's in UTC and b) the raw ISO form is
    # awkward in statistical programs, so reformat for spreadsheets.
    status_published = datetime.datetime.strptime(
        status['created_time'], '%Y-%m-%dT%H:%M:%S+0000'
    ).strftime('%Y-%m-%d %H:%M:%S')

    # Nested counters: an absent key means zero engagement.
    num_reactions = status['reactions']['summary']['total_count'] if 'reactions' in status else 0
    try:
        num_comments = status['comments']['summary']['total_count'] if 'comments' in status else 0
    except (KeyError, TypeError):
        # malformed 'comments' payload — count it as zero
        num_comments = 0
    num_shares = status['shares']['count'] if 'shares' in status else 0

    # Per-reaction counts (useful for sentiment). Only query reactions for
    # posts published after their global launch:
    # http://newsroom.fb.com/news/2016/02/reactions-now-available-globally/
    reactions = getReactionsForStatus(status_id, access_token) if status_published > '2016-02-24 00:00:00' else {}

    def _reaction_total(kind):
        # each aliased entry mirrors the summary shape of the feed call
        return reactions[kind]['summary']['total_count'] if kind in reactions else 0

    return (status_id, status_message, link_name, status_type, status_link,
            status_published, num_reactions, num_comments, num_shares,
            _reaction_total('like'), _reaction_total('love'),
            _reaction_total('wow'), _reaction_total('haha'),
            _reaction_total('sad'), _reaction_total('angry'), company)
def scrapeFacebookPageFeedStatus(page_id, access_token):
    """Scrape recent statuses of each page in *page_id* into one CSV file.

    Writes results/benchmark_facebook_statuses_<date>.csv (uses the
    module-level ``date`` and ``old_date`` globals) and pages through each
    feed until posts fall outside the ``old_date`` cutoff.

    Fix: ``open(..., 'w')`` now passes ``newline=''`` as the csv module
    requires (otherwise blank rows appear on Windows).
    """
    with open('results/benchmark_facebook_statuses_{}.csv'.format(date),
              'w', newline='') as file:
        w = csv.writer(file)
        w.writerow(["status_id", "status_message", "link_name", "status_type", "status_link",
                    "status_published", "num_reactions", "num_comments", "num_shares", "num_likes",
                    "num_loves", "num_wows", "num_hahas", "num_sads", "num_angrys", "company_name"])
        num_processed = 0  # keep a count on how many we've processed
        for company in page_id:
            has_next_page = True
            scrape_starttime = datetime.datetime.now()
            print("Scraping {} Facebook Page: {}\n".format(company, scrape_starttime))
            statuses = getFacebookPageFeedData(company, access_token, 100)
            # default the cutoff variable in case the feed page is empty
            processed_date = str(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
            while has_next_page:
                for status in statuses['data']:
                    processed_date = str(datetime.datetime.strptime(
                        status['created_time'], '%Y-%m-%dT%H:%M:%S+0000'))
                    # only keep posts newer than the old_date cutoff
                    if processed_date > old_date:
                        w.writerow(processFacebookPageFeedStatus(status, access_token, company))
                        # output progress occasionally to make sure code is not stalling
                        num_processed += 1
                        if num_processed % 100 == 0:
                            print("{} Statuses Processed from {}: {}".format(
                                num_processed, company, datetime.datetime.now()))
                # if there is no next page (or we passed the cutoff), we're done
                if 'paging' in statuses and processed_date > old_date:
                    statuses = request_until_succeed(statuses['paging']['next'])
                else:
                    has_next_page = False
        print("\nDone!\n{} Statuses Processed in {}\n".format(
            num_processed, datetime.datetime.now() - scrape_starttime))
# --- Configuration -------------------------------------------------------
# App credentials: an app access token is "<app_id>|<app_secret>".
app_id = ''
app_secret = ''
# Pages to benchmark against each other.
page_id = ['Bookingcom', 'Expedia', 'TripAdvisor', 'Airbnb', 'Kayak', 'Hotels.comUS']
access_token = "{}|{}".format(app_id, app_secret)
# Today's date (CSV filename suffix) and the 90-day lookback cutoff.
date = datetime.datetime.now().strftime('%Y-%m-%d')
old_date = (datetime.datetime.now()
            - datetime.timedelta(days=90)).strftime('%Y-%m-%d %H:%M:%S')

if __name__ == '__main__':
    scrapeFacebookPageFeedStatus(page_id, access_token)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment