Created
April 7, 2013 09:42
-
-
Save siscia/5329796 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import facebook | |
import time | |
import datetime | |
from StringIO import StringIO | |
from tempfile import NamedTemporaryFile | |
from urllib2 import urlopen, HTTPError | |
def simple_request(path):
    """Fetch `path` (a full URL) from the Graph API directly with urllib2.

    Used to follow raw "paging.next" URLs that already contain their
    query string, bypassing the SDK's request machinery.

    Returns the parsed JSON for a text response, or a dict with keys
    "data" (raw bytes), "mime-type" and "url" for an image response.

    Raises facebook.GraphAPIError on an HTTP error, on an API-level
    error embedded in the JSON, or on an unexpected content type.
    """
    try:
        resp = urlopen(path)
    except HTTPError as e:  # "except E, e" is py2-only; "as" works on py2.6+
        raise facebook.GraphAPIError(facebook._parse_json(e.read()))
    try:
        info = resp.info()
        if info.maintype == 'text':
            response = facebook._parse_json(resp.read())
        elif info.maintype == 'image':
            response = {
                "data": resp.read(),
                "mime-type": info['content-type'],
                "url": resp.url,
            }
        else:
            raise facebook.GraphAPIError('Maintype was not text or image')
    finally:
        # Always release the HTTP connection, even when parsing fails.
        resp.close()
    if response and isinstance(response, dict) and response.get("error"):
        # BUG FIX: the bare name GraphAPIError was a NameError here; it is
        # only in scope as facebook.GraphAPIError in this module.
        raise facebook.GraphAPIError(response["error"]["type"],
                                     response["error"]["message"])
    return response
# Per-post-type field lists requested from the Graph API when a post is
# analyzed.  The "likes.fields(...)" / "comments.fields(...)" entries use
# the Graph API field-expansion syntax to pull nested data in one request.
type_to_fields = {
    "link": [
        "id", "message", "picture", "link", "created_time",
        "likes.fields(name,pic_square)",
        "comments.fields(from,message)",
        "description", "type",
    ],
    "video": [
        "id", "story", "link", "picture", "name", "description",
        "likes.fields(pic_square,name)",
        "comments.fields(message,id,from)",
        "type", "created_time",
    ],
    "status": [
        "id", "story", "story_tags", "type", "created_time", "status_type",
    ],
    "photo": [
        "id", "message", "story", "story_tags", "name", "type",
        "object_id", "created_time",
        "likes.fields(name,pic_square)",
        "comments.fields(from,message)",
    ],
    "statuses": [
        "id", "message", "updated_time",
        "likes.fields(id,pic_square,name)",
        "comments.fields(message,from,created_time,like_count)",
    ],
    "swf": [
        "id", "story", "link", "picture", "name", "description",
        "likes.fields(pic_square,name)",
        "comments.fields(message,id,from)",
        "type", "created_time",
    ],
}
def back_dates(from_epoch, days=0):
    """Return the epoch timestamp `days` whole days before `from_epoch`."""
    seconds_per_day = 24 * 60 * 60
    return from_epoch - days * seconds_per_day
def open_image(url):
    """Download `url` into a named temporary file and return the file
    object, positioned at the start and ready for reading.

    NOTE: delete=False means the caller is responsible for eventually
    removing the temp file from disk (its path is in `image.name`).
    """
    remote = urlopen(url)
    try:
        data = remote.read()
    finally:
        # BUG FIX: the original never closed the HTTP response, leaking
        # one connection per downloaded image.
        remote.close()
    image = NamedTemporaryFile(delete=False)
    image.write(data)
    image.seek(0)
    return image
class MyDiary(facebook.GraphAPI):
    """Graph API / FQL client that exports a user's recent activity
    (posts, statuses, message threads) together with attached images
    and the profile pictures of people who liked or commented.
    """

    def __init__(self, token):
        super(MyDiary, self).__init__(token)

    def analyze_paging(self, with_paging):
        """Follow every "paging.next" link of a paged response, in place,
        and return the flattened list accumulated in with_paging["data"]."""
        next_pag = simple_request(with_paging["paging"]["next"])
        if len(next_pag["data"]) > 0:
            # BUG FIX: list.extend() returns None, so the original
            # {"data": list.extend(...)} stored None and lost all data.
            with_paging["data"].extend(next_pag["data"])
            with_paging["paging"] = next_pag["paging"]
            # BUG FIX: was self.analyze_pagging (typo) -> AttributeError
            # as soon as a second page existed.
            self.analyze_paging(with_paging)
        return with_paging["data"]

    def _cycle_to_paging(self, dict_to):
        """Replace the paged "likes"/"comments" sub-objects of a post with
        their fully de-paginated data lists.  Mutates dict_to."""
        if "likes" in dict_to:
            dict_to["likes"] = self.analyze_paging(dict_to["likes"])
        if "comments" in dict_to:
            dict_to["comments"] = self.analyze_paging(dict_to["comments"])
        return 0

    def get_pic_square_from_uid(self, uid):
        """Return an open temp file with the square profile picture of the
        user `uid` (expected to be a numeric id as a string), via FQL."""
        url_pic = self.fql("select pic_square from user where uid = " + uid)[0]
        return open_image(url_pic["pic_square"])

    def get_posts_id(self, since=None, untill=None, fields=None, limit=250):
        """Return the user's posts in the [since, untill] window that pass
        filter_status().

        BUG FIX: the old defaults called time.time() in the `def` line, so
        the "last 30 days" window was frozen at import time; they are now
        computed per call.  (`untill` keeps its historical spelling for
        caller compatibility.)
        """
        if since is None:
            since = int(back_dates(time.time(), days=30))
        if untill is None:
            untill = int(time.time())
        fields = (fields or []) + ["id", "type"]
        # BUG FIX: the Graph API parameter is spelled "until", not "untill".
        posts = self.request("me/posts", args={"until": untill,
                                               "since": since,
                                               "limit": limit,
                                               "fields": fields})
        return [post for post in posts["data"] if self.filter_status(post)]

    def get_element(self, element_id):
        """Fetch a single Graph API object by id."""
        return self.request(element_id)

    def get_user_picture(self, user_id, dimension="square"):
        """Return an open temp file with user_id's profile picture.

        BUG FIX: `dimension` is now honoured; it used to be silently
        ignored in favour of a hard-coded "square".
        """
        response = self.request(
            user_id, args={"fields": ["picture.type(%s)" % dimension]})
        return open_image(response["picture"]["data"]["url"])

    def analyze(self, object_id, fields):
        """Fetch object_id requesting only the given fields."""
        return self.request(object_id, args={"fields": fields})

    def get_images(self, link_dict):
        """Download every picture a post references: its own "picture" /
        "source", each liker's pic_square, and each commenter's profile
        picture.  Mutates and returns link_dict."""
        if "picture" in link_dict:
            link_dict["picture"] = open_image(link_dict["picture"])
        if "source" in link_dict:
            link_dict["source"] = open_image(link_dict["source"])
        if "likes" in link_dict:
            # assumes _cycle_to_paging already flattened "likes" to a list
            for like in link_dict["likes"]:
                like["pic_square"] = open_image(like["pic_square"])
        if "comments" in link_dict:
            for comment in link_dict["comments"]:
                comment["pic_square"] = self.get_user_picture(
                    comment["from"]["id"], dimension="square")
        return link_dict

    def analyze_link(self, link_id):
        """Fully resolve a "link" post: fields, pagination, images."""
        link = self.analyze(link_id, type_to_fields["link"])
        self._cycle_to_paging(link)
        self.get_images(link)
        return link

    def analyze_video(self, video_id):
        """Fully resolve a "video" post."""
        video = self.analyze(video_id, type_to_fields["video"])
        self._cycle_to_paging(video)
        self.get_images(video)
        return video

    def analyze_status(self, status_id):
        """Fully resolve a "status" post, including its story tags."""
        status = self.analyze(status_id, type_to_fields["status"])
        self.analyze_story_tags(status)
        return status

    def analyze_story_tags(self, status):
        """Flatten "story_tags" to a list of tagged users, attaching each
        user's square profile picture.  Mutates and returns status."""
        story_tag = status["story_tags"]
        # story_tags maps a text offset to a list of tags; only the first
        # tag at each offset is kept (original behaviour, preserved).
        tags = [t[0] for t in story_tag.values()]
        for user in tags:
            user["pic_square"] = self.get_pic_square_from_uid(user["id"])
        status["story_tags"] = tags
        return status

    def analyze_swf(self, swf_id):
        """Fully resolve an embedded-flash ("swf") post."""
        swf = self.analyze(swf_id, type_to_fields["swf"])
        self._cycle_to_paging(swf)
        self.get_images(swf)
        return swf

    def get_status_type(self, status_id):
        """Fetch only the "status_type" field of a status post."""
        return self.analyze(status_id, ["status_type"])

    def filter_status(self, post):
        """Keep every non-status post; keep a status post only when it is
        an "approved_friend" event."""
        if post["type"] != "status":
            return True
        status = self.get_status_type(post["id"])
        return status.get("status_type") == "approved_friend"

    def analyze_photo(self, photo_id):
        """Fully resolve a "photo" post, fetching the full-size image URL
        from the underlying photo object."""
        photo = self.analyze(photo_id, type_to_fields["photo"])
        # get_images() downloads whatever ends up under "source".
        photo["source"] = self.request(photo["object_id"])["source"]
        self._cycle_to_paging(photo)
        self.get_images(photo)
        return photo

    def analyze_statuses(self, statuses_id):
        """Fully resolve a status-stream ("statuses") item."""
        statuses = self.analyze(statuses_id, type_to_fields["statuses"])
        self._cycle_to_paging(statuses)
        self.get_images(statuses)
        return statuses

    def get_statuses(self, since=None, untill=None, fields=None, limit=25):
        """Return the user's statuses in the time window, tagging each one
        with type "statuses" and aliasing updated_time as created_time.

        BUG FIX: defaults are computed per call instead of being frozen at
        import time, and the mutable default fields=["id"] is gone.
        """
        if since is None:
            since = int(back_dates(time.time(), days=30))
        if untill is None:
            untill = int(time.time())
        if fields is None:
            fields = ["id"]
        # BUG FIX: the Graph API parameter is spelled "until", not "untill".
        statuses = self.request("me/statuses",
                                args={"fields": fields,
                                      "since": since,
                                      "until": untill,
                                      "limit": limit})["data"]
        for state in statuses:
            state["type"] = "statuses"
            state["created_time"] = state["updated_time"]
        return statuses

    def get_threads_id(self, since=None, untill=None):
        """Return the set of message-thread ids updated inside the
        (since, untill) window across folders 0/1/3/4."""
        if since is None:
            since = int(back_dates(time.time(), days=30))
        if untill is None:
            untill = int(time.time())
        ids = self.fql(
            "SELECT thread_id FROM thread WHERE (folder_id = 0 or"
            " folder_id = 1 or folder_id = 4 or folder_id = 3)"
            " and updated_time < " + str(untill) +
            " and updated_time > " + str(since))
        return set(x['thread_id'] for x in ids)

    def analyze_thread(self, thread_id, since=None, untill=None):
        """Return the messages of a thread in the time window, newest
        first.  FQL caps a result set around 20 rows, so older pages are
        pulled in by _get_more_message()."""
        if since is None:
            since = int(back_dates(time.time(), days=30))
        if untill is None:
            untill = int(time.time())
        query = ("SELECT body, author_id, attachment, created_time"
                 " FROM message WHERE thread_id = " + str(thread_id) +
                 " and created_time > " + str(since) +
                 " and created_time < " + str(untill) +
                 " ORDER BY created_time DESC")
        messages = self.fql(query)
        print(query)  # parenthesised form is valid on both py2 and py3
        if len(messages) >= 20:
            self._get_more_message(messages, thread_id, since, untill)
        return messages

    def _get_more_message(self, acc, thread_id, since, untill):
        """Keep fetching pages older than the last message in `acc` until
        a page comes back shorter than the ~20-row FQL limit.  Extends
        `acc` in place."""
        message = acc
        while len(message) >= 20:
            # BUG FIX: select the same columns as analyze_thread so every
            # row in `acc` has a uniform shape (the old follow-up query
            # dropped author_id and attachment from the older pages).
            message = self.fql(
                "select body, author_id, attachment, created_time"
                " from message where thread_id = " + str(thread_id) +
                " and created_time > " + str(since) +
                " and created_time < " + str(message[-1]["created_time"]) +
                " order by created_time DESC")
            acc.extend(message)
        return 0

    def join_list(self, first_list, *args):
        """Extend first_list with every list in *args; returns it."""
        for extra in args:
            first_list.extend(extra)
        return first_list

    def sort_by_date(self, to_sort):
        """Sort a list of posts in place by created_time; returns it."""
        to_sort.sort(key=lambda post: post['created_time'])
        return to_sort

    def analyze_all_post(self, posts):
        """Lazily resolve every post; each item of `posts` is a dict with
        at least "type" and "id".  Yields each fully-resolved post."""
        dispatch = {
            "link": self.analyze_link,
            "video": self.analyze_video,
            "photo": self.analyze_photo,
            "status": self.analyze_status,
            "statuses": self.analyze_statuses,
            "swf": self.analyze_swf,
        }
        for post in posts:
            yield dispatch[post["type"]](post["id"])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment