Created
February 19, 2016 06:01
-
-
Save ratulotron/ba51674c05cd553d6983 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import requests as r | |
import csv | |
import re | |
from pprint import pprint | |
""" | |
Code written for Python 3. It has better support for UTF-8. | |
Get a page access token first, then use the following URL to get a long lived access token. Long lived access tokens are valid for 60 days. | |
https://graph.facebook.com/oauth/access_token?grant_type=fb_exchange_token&client_id={APP_ID}&{APP_SECRET}={PAGE_ACCESS_TOKEN} | |
""" | |
# URL template for making requests | |
GRAPH_URL = 'https://graph.facebook.com/v2.5/{OBJECT_ID}/?key=value&access_token={ACCESS_TOKEN}&fields={FIELDS}' | |
# Long lived access token | |
ACCESS_TOKEN = '' | |
# Any page ID. | |
PAGE_ID = '' | |
# emoji list to strip from comments | |
emoji_pattern = re.compile("[" | |
u"\U0001F600-\U0001F64F" # emoticons | |
u"\U0001F300-\U0001F5FF" # symbols & pictographs | |
u"\U0001F680-\U0001F6FF" # transport & map symbols | |
# u"\U0001F1E0-\U0001F1FF" # flags (iOS) | |
"]+", flags=re.UNICODE) | |
def get_json(object_id, fields, timeout=30):
    """Fetch `fields` of Graph API object `object_id`; return the decoded JSON.

    Parameters:
        object_id: Graph API object ID (a page, video, comment, ...).
        fields: value for the `fields` query parameter, e.g. 'comments'
            or 'videos{comments}'.
        timeout: seconds to wait for the server. `requests` never times
            out by default, which can hang the script forever; callers
            that relied on the old signature are unaffected.

    Returns:
        The response body parsed as a dict (whatever JSON the API sent,
        including error payloads — no status check is performed here).
    """
    url = GRAPH_URL.format(OBJECT_ID=object_id,
                           ACCESS_TOKEN=ACCESS_TOKEN,
                           FIELDS=fields)
    response = r.get(url, timeout=timeout)
    return response.json()
def _iter_edge(edge):
    """Yield every item of a paginated Graph API edge, following pagination.

    `edge` is a dict shaped like {'data': [...], 'paging': {...}}; items are
    yielded page by page until no 'paging.next' link remains.  The original
    code used `while True` + bare `except:` around each page, which silently
    aborted on ANY error (including the KeyError caused by indexing
    ['videos'] into an already-unwrapped page) — here termination is explicit.
    """
    while True:
        for item in edge.get('data', []):
            yield item
        next_url = edge.get('paging', {}).get('next')
        if next_url is None:
            return
        edge = r.get(next_url).json()
        # A paginated follow-up request normally returns the edge directly;
        # if the payload is still wrapped (e.g. {'videos': {...}}), unwrap it
        # so the loop keeps working.  TODO(review): confirm against the
        # Graph API version in use.
        if 'data' not in edge and 'videos' in edge:
            edge = edge['videos']


def main():
    """Collect every commented video of PAGE_ID, then write each video's
    comments as (created_time, emoji-stripped message) rows to
    '<video_id>.csv'."""
    videos = get_json(PAGE_ID, 'videos{comments}')['videos']
    # Only keep videos that actually have comments.
    video_ids = [video['id'] for video in _iter_edge(videos)
                 if 'comments' in video]
    print('There are total {} videos.'.format(len(video_ids)))
    for video_id in video_ids:
        comments = get_json(video_id, 'comments')['comments']
        # (created_time, message) tuples with emoji stripped.  Entries
        # missing either key are skipped explicitly — the original bare
        # `except` silently abandoned the rest of the pagination on them.
        comment_list = [(c['created_time'],
                         emoji_pattern.sub(r'', c['message']))
                        for c in _iter_edge(comments)
                        if 'created_time' in c and 'message' in c]
        print('Video {} has {} comments.'.format(video_id, len(comment_list)))
        # newline='' is required by the csv module so quoted fields with
        # embedded newlines round-trip and Windows doesn't emit blank rows.
        with open(video_id + '.csv', 'w', encoding='utf8', newline='') as f:
            writer = csv.writer(f, quoting=csv.QUOTE_NONNUMERIC)
            writer.writerows(comment_list)
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment