# sousatg/agregador_facebook.py

## agregador_facebook.py
from splinter import Browser
from lxml import etree
import json

# Takes an etree object and returns a string with its HTML code
def get_page_posts(article):
	"""Serialize the text container of one Facebook post to markup.

	Args:
		article: lxml element for one post wrapper, as produced by
			get_facebook_articles.

	Returns:
		Text string with the pretty-printed serialization of the first
		descendant <div> whose class attribute contains "userContent".

	Raises:
		IndexError: if the article contains no such <div>.
	"""
	article_text = article.xpath('.//div[contains(@class, "userContent")]')[0]
	# etree.tostring() already returns an ASCII-encoded byte string (non-ASCII
	# characters are emitted as character references). Calling .encode() on
	# that result raises AttributeError on Python 3, so decode it instead:
	# the outcome is JSON-serializable text on both Python 2 and Python 3.
	return etree.tostring(article_text, pretty_print=True).decode('ascii')


# get facebook articles from several websites
def facebook( urls ):
	"""Collect the posts of several Facebook pages in one browser session.

	Args:
		urls: list of page URLs, handed to facebook_aux unchanged.

	Returns:
		The list returned by facebook_aux for these URLs.
	"""
	b = Browser()
	try:
		return facebook_aux( b, urls )
	finally:
		# Always shut the browser down, even when a page visit or the
		# parsing fails, so no browser process is left running.
		b.quit()

# Helper function that walks through the given list of URLs
def facebook_aux (b, urls):
	"""Visit every URL with the given browser and collect its posts.

	Args:
		b: browser object; its visit(url) method is called for each URL and
			its html attribute is read afterwards.
		urls: sequence of page URLs. It is not modified.

	Returns:
		List with one dict per URL, each holding 'origin' (always
		'facebook'), 'posts' (list of serialized posts) and 'url'. Entries
		follow the URLs from last to first; an empty input gives [].
	"""
	results = []
	# Walk the URLs back to front. The previous recursive implementation
	# consumed the list with pop(), so this keeps the same result order
	# without emptying the caller's list or recursing once per URL.
	for url in reversed( urls ):
		b.visit( url )
		results.append({
			'origin' : 'facebook',
			# A real list rather than map(): on Python 3 map() is lazy and
			# its result cannot be written by json.dump.
			'posts' : [get_page_posts( article ) for article in get_facebook_articles( b.html )],
			'url' : url
		})
	return results

# Gets the Facebook post elements from an HTML string
def get_facebook_articles(html):
	"""Parse an HTML string and return its Facebook post wrapper elements.

	Args:
		html: HTML source of a page, as a string.

	Returns:
		Result of an XPath query for every <div> whose class attribute
		contains "userContentWrapper".
	"""
	wrapper_query = '//div[contains(@class, "userContentWrapper")]'
	tree = etree.HTML( html )
	return tree.xpath( wrapper_query )

# Saves the most recent posts from Facebook pages to the file output.json
if __name__ == '__main__':
	# Facebook pages whose posts are aggregated.
	urls = [
		'http://www.facebook.com/cesiuminho',
		'http://www.facebook.com/niaeufp',
		'https://www.facebook.com/techinsider',
		'https://www.facebook.com/TheHowToGeek/',
		'https://www.facebook.com/mooclab.club/',
		'https://www.facebook.com/pythonporto/'
	]
	facebook_articles = facebook(urls)

	# json.dump writes text, so the file is opened in text mode (binary mode
	# fails on Python 3). The with-block closes the file even if the dump
	# raises.
	with open( 'output.json', 'w' ) as fh:
		json.dump(facebook_articles, fh)