# Original author: Weiai Wayne (curiositybits.com); saved as a GitHub Gist by @davidrenne, February 24, 2015.
#-*-coding:utf-8-*-
# This script writes Facebook Graph API output to a SQLite DB; the DB file should sit in the same folder as this script unless you specify a path to it below.
"""
Facebook Graph API Explorer (http://developers.facebook.com/tools/explorer)
GO HERE FOR DEFINITIONS OF VARIABLES RETURNED BY API:
https://developers.facebook.com/docs/reference/api/post/
JSON Viewer (http://jsonviewer.stack.hu/)
Home:
http://curiositybits.com/python-for-mining-the-social-web/python-tutorial-mining-facebook-fan-page-getting-posts-and-comments/
"""
# import necessary Python libraries
import sys
import urllib
import string
import simplejson
import sqlite3
from datetime import datetime  # only datetime.now() is used below
from pprint import pprint
import sqlalchemy
from sqlalchemy.orm import mapper, sessionmaker
from sqlalchemy import Column, Integer, String, ForeignKey, Text, Numeric, DateTime
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import update
from types import *
import re
#https://developers.facebook.com/tools/explorer
# Retrieve auth token (PHP-style example of the app token endpoint):
#$authToken = fetchUrl("https://graph.facebook.com/oauth/access_token?grant_type=client_credentials&client_id={$app_id}&client_secret={$app_secret}");
access_token = 'CAACEdEose0cBAP6zQXvDZCyMYPZAk8iiuRfL9QqJNwC5JbX7fYZBKnqxZApZBa1BMGXHHwxvUPfo1ArjAIC6hNzS9XoXAs55d3UHEG81fW89CiZAuZCqZAIGzGCfODCsYfzhU7kcxAoJAy8pqdCPESAzVnebVZC0lLymZAzP6BWwG2meDY9qtb4KNOznHSHfnEjz15oH2pVsXDJYGJvMyWWrAttTvz3ZAop9dMZD'
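# Optional sketch (an assumption, not part of the original workflow): fetch an app
# access token programmatically from the oauth endpoint shown in the comment above
# instead of pasting one by hand. app_id/app_secret are placeholders you supply,
# and the response body is assumed to be an "access_token=..." query string.
def fetch_app_access_token(app_id, app_secret):
    token_url = ("https://graph.facebook.com/oauth/access_token"
                 "?grant_type=client_credentials"
                 "&client_id=%s&client_secret=%s" % (app_id, app_secret))
    body = urllib.urlopen(token_url).read()
    # e.g. body looks like "access_token=1234567890|AbCdEf..."
    return body.split("access_token=")[-1]
# access_token = fetch_app_access_token("YOUR_APP_ID", "YOUR_APP_SECRET")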
# If a page name contains multiple words separated by spaces (e.g. "SPOT Coffee"), it shows up in the URL as words joined by hyphens, e.g. "Spot-Coffee-Elmwood".
# In that case it is recommended to use the page id instead: the string of numbers after the page name in the URL (e.g. https://www.facebook.com/pages/Spot-Coffee-Elmwood/316579834919).
fb_users = ['Spot-Coffee-Elmwood', 'eastloshigh']
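# For example, the same list with the numeric page id from the URL above in place
# of the hyphenated page name (illustrative only):
# fb_users = ['316579834919', 'eastloshigh']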
Base = declarative_base()
###########################
class STATUS(Base):
    __tablename__ = 'entries' #change the table name to your preference.
    id = Column(Integer, primary_key=True)
    org_name = Column(String) #the name of a Facebook page; useful when you are mining content from multiple pages.
    content_full = Column(String)
    date = Column(String)
    author = Column(String)
    content_new = Column(String)
    original_name = Column(String)
    FB_org_id = Column(String) #page id, a unique identifier for a Facebook page.
    location = Column(String)
    link = Column(String) #the URL of a Facebook page post.
    message_id = Column(String) #unique identifier for a Facebook page post, formatted as 'Page ID_Message ID'
    org_id = Column(String) #same as page id
    status_id = Column(String, unique=True) #unique identifier for a Facebook page post, without the page id.
    status_link = Column(String) #same as link
    content = Column(String) #the textual content of a post
    published_date = Column(String) #when a Facebook page post was sent.
    date_inserted = Column(DateTime) #when a post was retrieved and stored in your SQLite database, i.e. the time of your data mining.
    last_comment = Column(String)
    type = Column(String) #the type of post: status, link, photo, video, etc.
    status_type = Column(String) #more detailed status post type (e.g. mobile_status_update, added_photo)
    video_source = Column(String) #included URL to a video
    picture_link = Column(String) #the name of the webpage that the included URL links to.
    link_name = Column(String)
    link_caption = Column(String)
    link_description = Column(String) #description of the webpage that the included URL links to.
    num_mentions = Column(Integer) #the number of mentions in a post.
    mentions = Column(String) #the mentioned pages/people in a post.
    num_likes = Column(Integer)
    like_count = Column(Integer)
    comment_count = Column(Integer)
    share_count = Column(Integer)
    hashtag_count = Column(Integer)
    hashtags = Column(String)
    mentions_count = Column(Integer)
    actions = Column(String)
    application = Column(String)
    properties = Column(String)
    message_output = Column(String) #the entire raw JSON output, including information not parsed into the other columns.
    time_since_post_days = Column(String)
    like_count_7days = Column(Integer)
    comment_count_7days = Column(Integer)
    share_count_7days = Column(Integer)
    time_since_post_14days = Column(Integer)
    like_count_14days = Column(Integer)
    comment_count_14days = Column(Integer)
    share_count_14days = Column(Integer)
    images = Column(Integer)
    urls_count = Column(Integer)
    urls_count_true = Column(Integer)
    first_page_comment = Column(String) #comments (sender, content and posted time) on the first page of comments.
    comments_beyond_pageone = Column(String) #comments on the second page and beyond.

    def __init__(self, id, org_name, FB_org_id, original_name, content_full, date, author, content_new, location, link, message_id, org_id, status_id, status_link,
                 content, published_date, date_inserted, last_comment, type, status_type, video_source, picture_link,
                 link_name, link_caption, link_description, num_mentions, mentions, num_likes, like_count,
                 comment_count, share_count, hashtag_count, hashtags, mentions_count,
                 actions, application, properties, message_output,
                 time_since_post_days, like_count_7days, comment_count_7days, share_count_7days,
                 time_since_post_14days, like_count_14days, comment_count_14days, share_count_14days,
                 images, urls_count, urls_count_true, first_page_comment, comments_beyond_pageone,
                 ):
        self.org_name = org_name
        self.FB_org_id = FB_org_id
        self.original_name = original_name
        self.location = location
        self.link = link
        self.message_id = message_id
        self.org_id = org_id
        self.content_full = content_full #added for reaction count
        self.date = date #added for reaction count
        self.author = author #added for reaction count
        self.content_new = content_new #added for reaction count
        self.status_id = status_id
        self.status_link = status_link
        self.content = content
        self.published_date = published_date
        self.date_inserted = date_inserted
        self.last_comment = last_comment
        self.type = type
        self.status_type = status_type
        self.video_source = video_source
        self.picture_link = picture_link
        self.link_name = link_name
        self.link_caption = link_caption
        self.link_description = link_description
        self.num_mentions = num_mentions
        self.mentions = mentions
        self.num_likes = num_likes
        self.like_count = like_count
        self.comment_count = comment_count
        self.share_count = share_count
        self.hashtag_count = hashtag_count
        self.hashtags = hashtags
        self.mentions_count = mentions_count
        self.actions = actions
        self.application = application
        self.properties = properties
        self.message_output = message_output
        self.time_since_post_days = time_since_post_days
        self.like_count_7days = like_count_7days
        self.comment_count_7days = comment_count_7days
        self.share_count_7days = share_count_7days
        self.time_since_post_14days = time_since_post_14days
        self.like_count_14days = like_count_14days
        self.comment_count_14days = comment_count_14days
        self.share_count_14days = share_count_14days
        self.images = images
        self.urls_count = urls_count
        self.urls_count_true = urls_count_true
        self.first_page_comment = first_page_comment #added for comments on page 1
        self.comments_beyond_pageone = comments_beyond_pageone #added for comments beyond page 1

    def __repr__(self):
        return "<Organization, Sender('%s', '%s')>" % (self.id, self.link)
url = "https://graph.facebook.com/%s/posts?access_token=" + access_token + "&include_hidden=true" #paste your own access token here.
def get_data(kid):
    try:
        d = simplejson.loads(urllib.urlopen(url % (kid)).read())
    except Exception, e:
        print "Error reading id %s, exception: %s" % (kid, e)
        return None
    print "d.keys(): ", d.keys()
    return d
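# As used below, the parsed response d looks roughly like
#   {'data': [ {...one post...}, ... ], 'paging': {'next': '<URL of older posts>', ...}}
# (a sketch inferred from the keys accessed in write_data and main, not the full API schema).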
def write_data(self, d):
    date_inserted = datetime.now()
    messages = d['data']
    number_on_page = len(messages)
    print "NUMBER OF MESSAGES IN THIS SET OF RESULTS:", len(messages)
    for message in d['data']:
        message_output = str(message)
        org_name = message['from']['name']
        FB_org_id = message['from']['id']
        original_name = ''
        if 'original_name' in message:
            original_name = message['original_name']
        if 'place' in message:
            location = str(message['place'])
        else:
            location = ''
        if 'actions' in message:
            actions = str(message['actions'])
        else:
            actions = ''
        if 'link' in message:
            link = message['link']
        else:
            link = ''
        if 'name' in message:
            link_name = message['name']
        else:
            link_name = ''
        if 'caption' in message:
            link_caption = message['caption']
        else:
            link_caption = ''
        if 'description' in message:
            link_description = message['description']
        else:
            link_description = ''
        if 'shares' in message:
            num_shares = message['shares']['count']
        else:
            num_shares = 0
        if 'message' in message:
            content = message['message']
            content = content.replace('\n', '')
        else:
            content = ''
        last_comment = message['updated_time']
        published_date = message['created_time']
        type = message['type']
        message_id = message['id']
        org_id = message_id.split('_')[0]
        status_id = message_id.split('_')[1]
        status_link = 'https://www.facebook.com/%s/posts/%s' % (org_id, status_id)
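        # e.g. a message_id of '316579834919_1234567890' (page id taken from the example
        # page URL near the top of this script; the post id part is made up) splits into
        # org_id '316579834919' and status_id '1234567890'.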
        if 'status_type' in message:
            status_type = message['status_type']
        else:
            status_type = ''
        if 'properties' in message:
            properties = str(message['properties'])
        else:
            properties = ''
        if 'application' in message:
            application = str(message['application'])
        else:
            application = ''
        if 'picture' in message:
            picture_link = message['picture']
        else:
            picture_link = ''
        if 'source' in message:
            video_source = message['source']
        else:
            video_source = ''
        first_page_comment = []
        comments_beyond_pageone = []
        if 'comments' in message:
            user_comment_onepage = [] #temporary list for comments on page 1.
            for each_comment in message['comments']['data']:
                comment = each_comment['message']
                user_comment_onepage.append(comment)
            first_page_comment = user_comment_onepage
            user_comment_morepages = [] #temporary list for comments beyond page 1.
            if 'next' in message['comments']['paging']:
                next_comment_url = message['comments']['paging']['next']
                if next_comment_url:
                    count = 2
                    while count < 80:
                        try:
                            get_more_comment = simplejson.loads(urllib.urlopen(next_comment_url).read())
                            for more_comments in get_more_comment['data']:
                                comment_plus = more_comments['message']
                                user_comment_morepages.append(comment_plus)
                        except Exception, e:
                            print "Error reading"
                            break
                        if not 'data' in get_more_comment:
                            print "ok, move on"
                            continue
                        if len(get_more_comment['data']) == 0:
                            print "ok, move on"
                            continue
                        if 'paging' in get_more_comment:
                            if 'next' in get_more_comment['paging']:
                                next_comment_url = get_more_comment['paging']['next']
                                print "ok"
                            else:
                                print "okay move on"
                                print "ok"
                                break
                        count += 1
                        if count > 50:
                            print "WE'RE AT PAGE 50!!!!!"
                            break
                    comments_beyond_pageone = user_comment_morepages
            else:
                first_page_comment = user_comment_onepage
        first_page_comment = string.join(first_page_comment, u"***")
        comments_beyond_pageone = string.join(comments_beyond_pageone, u"***")
        # Comments script ends
        mentions_list = []
        num_mentions = 0
        if 'to' in message:
            num_mentions = len(message['to']['data'])
            if num_mentions != 0:
                mentions_list = [i['name'] for i in message['to']['data'] if 'name' in i]
            else:
                mentions_list = ''
            mentions = ', '.join(mentions_list)
        else:
            mentions = ''
        upd = STATUS(None, org_name, FB_org_id, original_name, None, None, None, None, location, link, message_id, org_id, status_id, status_link,
                     content, published_date, date_inserted, last_comment, type, status_type, video_source, picture_link,
                     link_name, link_caption, link_description, num_mentions, mentions, None, None, None, None, None, None, None, actions, application, properties, message_output,
                     None, None, None, None, None, None, None, None, None, None, None, first_page_comment, comments_beyond_pageone,
                     )
        self.session.add(upd)
        try:
            self.session.commit()
        except sqlalchemy.exc.SQLAlchemyError:
            self.session.rollback()
            print " NOT INSERTING --> IT'S A DUPLICATE"
class Scrape:
    def __init__(self):
        engine = sqlalchemy.create_engine("sqlite:///curiositybits_testing.sqlite", echo=False) #enter your database name and its file path here.
        Session = sessionmaker(bind=engine)
        self.session = Session()
        Base.metadata.create_all(engine)

    def main(self):
        keys = []
        ids = fb_users
        for feed in ids:
            kid = feed
            next_page_url = None
            d = get_data(kid)
            if not d:
                print "THERE WAS NO 'D' RETURNED........MOVING TO NEXT ID"
                continue ##### RETURN TO THE BEGINNING OF THE LOOP
            if not 'data' in d:
                print "THERE WAS NO 'D['DATA']' RETURNED........MOVING TO NEXT ID"
                continue
            #Add the feed name to the original_name column
            if 'data' in d:
                for pk, void in enumerate(d['data']):
                    d['data'][pk]['original_name'] = str(feed)
            if len(d['data']) == 0:
                print "THERE WERE NO STATUSES RETURNED........MOVING TO NEXT ID"
                continue ##### RETURN TO THE BEGINNING OF THE LOOP
            write_data(self, d)
            self.session.commit()
            paging = d.get('paging', {})
            if 'next' in paging:
                next_page_url = paging['next']
                print "HERE IS THE NEXT PAGE URL:", next_page_url
            else:
                print "THERE AIN'T NO NEXT PAGE FOR", feed
            if next_page_url:
                print "THERE WERE STATUSES ON THE FIRST PAGE! NOW MOVING TO GRAB EARLIER POSTS"
                count = 2
                while count < 100:
                    print "------XXXXXX------ STARTING PAGE", count
                    try:
                        d = simplejson.loads(urllib.urlopen(next_page_url).read())
                    except Exception, e:
                        print "Error reading id %s, exception: %s" % (kid, e)
                        break
                    print "d.keys(): ", d.keys() ##### d.keys(): [u'paging', u'data']
                    if not d:
                        print "THERE WERE NO STATUSES RETURNED........MOVING TO NEXT ID"
                        #continue ##### RETURN TO THE BEGINNING OF THE WHILE LOOP
                        break
                    if not 'data' in d:
                        print "THERE WERE NO STATUSES RETURNED........MOVING TO NEXT ID"
                        print d
                        break ##### STOP THE CURRENT WHILE LOOP AND GO TO THE NEXT STATEMENT
                    if 'data' in d:
                        for pk, void in enumerate(d['data']):
                            d['data'][pk]['original_name'] = str(feed)
                    if len(d['data']) == 0:
                        print "THERE WERE NO STATUSES RETURNED........MOVING TO NEXT ID"
                        print d
                        break ##### STOP THE CURRENT WHILE LOOP AND GO TO THE NEXT STATEMENT
                    write_data(self, d)
                    self.session.commit()
                    print "------XXXXXX------ FINISHED WITH PAGE", count
                    if 'paging' in d:
                        if 'next' in d['paging']:
                            next_page_url = d['paging']['next']
                            print "AND HERE IS THE NEXT PAGE URL:", next_page_url
                        else:
                            print "THERE AIN'T NO NEXT PAGE FOR", feed
                            print "--------------> WE'VE REACHED THE LAST PAGE!!!! MOVING TO NEXT ID"
                            break
                    count += 1
                    if count > 100:
                        print "WE'RE AT PAGE 100!!!!!"
                        break
            else:
                print "THERE AIN'T NO NEXT_PAGE_URL FOR FEED_ID", feed, " ---- GOING ON TO NEXT ID"
        self.session.commit()
        self.session.close()


if __name__ == "__main__":
    s = Scrape()
    s.main()
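# Note: this script uses Python 2 syntax (print statements, "except Exception, e"),
# so run it with a Python 2 interpreter. Output is written to
# curiositybits_testing.sqlite in the working directory (see the create_engine call
# above), into the 'entries' table defined by the STATUS class.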