Created
February 19, 2016 06:01
-
-
Save ratulotron/ba51674c05cd553d6983 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import requests as r | |
import csv | |
import re | |
from pprint import pprint | |
""" | |
Code written for Python 3. It has better support for UTF-8. | |
Get a page access token first, then use the following URL to get a long lived access token. Long lived access tokens are valid for 60 days. | |
https://graph.facebook.com/oauth/access_token?grant_type=fb_exchange_token&client_id={APP_ID}&{APP_SECRET}={PAGE_ACCESS_TOKEN} | |
""" | |
# URL template for making requests | |
GRAPH_URL = 'https://graph.facebook.com/v2.5/{OBJECT_ID}/?key=value&access_token={ACCESS_TOKEN}&fields={FIELDS}' | |
# Long lived access token | |
ACCESS_TOKEN = '' | |
# Any page ID. | |
PAGE_ID = '' | |
# emoji list to strip from comments | |
emoji_pattern = re.compile("[" | |
u"\U0001F600-\U0001F64F" # emoticons | |
u"\U0001F300-\U0001F5FF" # symbols & pictographs | |
u"\U0001F680-\U0001F6FF" # transport & map symbols | |
# u"\U0001F1E0-\U0001F1FF" # flags (iOS) | |
"]+", flags=re.UNICODE) | |
def get_json(object_id, fields, timeout=30):
    """Fetch `fields` of Graph API object `object_id`; return the decoded JSON.

    Parameters:
        object_id: Graph API object ID (a page, video, comment, ...).
        fields: value for the `fields` query parameter, e.g. 'comments'
            or 'videos{comments}'.
        timeout: seconds to wait for the server. `requests` never times
            out by default, which can hang the script forever; callers
            that relied on the old signature are unaffected.

    Returns:
        The response body parsed as a dict (whatever JSON the API sent,
        including error payloads — no status check is performed here).
    """
    url = GRAPH_URL.format(OBJECT_ID=object_id,
                           ACCESS_TOKEN=ACCESS_TOKEN,
                           FIELDS=fields)
    response = r.get(url, timeout=timeout)
    return response.json()
def _iter_edge(edge):
    """Yield every item of a paginated Graph API edge, following pagination.

    `edge` is a dict shaped like {'data': [...], 'paging': {...}}; items are
    yielded page by page until no 'paging.next' link remains.  The original
    code used `while True` + bare `except:` around each page, which silently
    aborted on ANY error (including the KeyError caused by indexing
    ['videos'] into an already-unwrapped page) — here termination is explicit.
    """
    while True:
        for item in edge.get('data', []):
            yield item
        next_url = edge.get('paging', {}).get('next')
        if next_url is None:
            return
        edge = r.get(next_url).json()
        # A paginated follow-up request normally returns the edge directly;
        # if the payload is still wrapped (e.g. {'videos': {...}}), unwrap it
        # so the loop keeps working.  TODO(review): confirm against the
        # Graph API version in use.
        if 'data' not in edge and 'videos' in edge:
            edge = edge['videos']


def main():
    """Collect every commented video of PAGE_ID, then write each video's
    comments as (created_time, emoji-stripped message) rows to
    '<video_id>.csv'."""
    videos = get_json(PAGE_ID, 'videos{comments}')['videos']
    # Only keep videos that actually have comments.
    video_ids = [video['id'] for video in _iter_edge(videos)
                 if 'comments' in video]
    print('There are total {} videos.'.format(len(video_ids)))
    for video_id in video_ids:
        comments = get_json(video_id, 'comments')['comments']
        # (created_time, message) tuples with emoji stripped.  Entries
        # missing either key are skipped explicitly — the original bare
        # `except` silently abandoned the rest of the pagination on them.
        comment_list = [(c['created_time'],
                         emoji_pattern.sub(r'', c['message']))
                        for c in _iter_edge(comments)
                        if 'created_time' in c and 'message' in c]
        print('Video {} has {} comments.'.format(video_id, len(comment_list)))
        # newline='' is required by the csv module so quoted fields with
        # embedded newlines round-trip and Windows doesn't emit blank rows.
        with open(video_id + '.csv', 'w', encoding='utf8', newline='') as f:
            writer = csv.writer(f, quoting=csv.QUOTE_NONNUMERIC)
            writer.writerows(comment_list)
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment