Created
February 10, 2016 14:22
-
-
Save sousatg/aedb8c4d84282f40b8d8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from splinter import Browser | |
from lxml import etree | |
import json | |
def get_page_posts(article):
    """Serialize the user content of one post element to an HTML string.

    Args:
        article: An etree element; its first descendant ``div`` whose class
            contains ``userContent`` is the part that gets serialized.

    Returns:
        The pretty-printed HTML of that ``div`` as a text string, or an
        empty string when ``article`` contains no such ``div``.
    """
    matches = article.xpath('.//div[contains(@class, "userContent")]')
    if not matches:
        # Indexing an empty result would raise IndexError and abort the
        # processing of every other post as well.
        return ''
    # tostring() without an explicit encoding emits ASCII bytes (non-ASCII
    # characters become numeric character references), so decoding as ASCII
    # is lossless and yields text on both Python 2 and Python 3. The previous
    # ``.encode('utf-8')`` fails on Python 3, where bytes has no encode().
    return etree.tostring(matches[0], pretty_print=True).decode('ascii')
def facebook(urls):
    """Collect the posts of several Facebook pages with one browser session.

    Args:
        urls: List of page URLs, handed to ``facebook_aux`` unchanged.

    Returns:
        The value returned by ``facebook_aux`` for ``urls``.
    """
    b = Browser()
    try:
        return facebook_aux(b, urls)
    finally:
        # Quit the browser even when visiting or parsing a page raises, so a
        # failed run does not leave a browser process behind.
        b.quit()
def facebook_aux(b, urls):
    """Visit every URL with browser ``b`` and gather the posts of each page.

    Args:
        b: Browser object exposing ``visit(url)`` and an ``html`` attribute.
        urls: Sequence of page URLs. It is read, never modified.

    Returns:
        A list with one dict per URL, last URL first, each holding
        ``'origin'`` (always ``'facebook'``), ``'posts'`` (list of HTML
        strings from ``get_page_posts``) and ``'url'``. An empty ``urls``
        gives an empty list.
    """
    results = []
    # Walk the URLs back to front: this keeps the output order of the former
    # pop()-based recursion without emptying the caller's list and without
    # being bounded by the recursion limit.
    for url in reversed(urls):
        b.visit(url)
        results.append({
            'origin': 'facebook',
            # A real list (not a lazy map object) so json.dump can serialize
            # the result on Python 3 as well.
            'posts': [
                get_page_posts(article)
                for article in get_facebook_articles(b.html)
            ],
            'url': url,
        })
    return results
def get_facebook_articles(html):
    """Return the post wrapper elements found in an HTML string.

    Args:
        html: HTML source of a page, as a string.

    Returns:
        The result of an XPath query for every ``div`` whose class contains
        ``userContentWrapper``, run on the parsed document.
    """
    wrapper_query = '//div[contains(@class, "userContentWrapper")]'
    return etree.HTML(html).xpath(wrapper_query)
# Save the posts scraped from the Facebook pages below to output.json
if __name__ == '__main__':
    urls = [
        'http://www.facebook.com/cesiuminho',
        'http://www.facebook.com/niaeufp',
        'https://www.facebook.com/techinsider',
        'https://www.facebook.com/TheHowToGeek/',
        'https://www.facebook.com/mooclab.club/',
        'https://www.facebook.com/pythonporto/'
    ]
    facebook_articles = facebook(urls)
    # Text mode, because json.dump writes str and a binary-mode file rejects
    # that on Python 3; the with-block closes the file even if dumping fails.
    with open('output.json', 'w') as fh:
        json.dump(facebook_articles, fh)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment