Skip to content

Instantly share code, notes, and snippets.

@sousatg
Created February 10, 2016 14:22
Show Gist options
  • Save sousatg/aedb8c4d84282f40b8d8 to your computer and use it in GitHub Desktop.
Save sousatg/aedb8c4d84282f40b8d8 to your computer and use it in GitHub Desktop.
from splinter import Browser
from lxml import etree
import json
# takes an etree object and returns a string of HTML code
def get_page_posts(article):
    """Return the markup of the first "userContent" div inside *article*.

    Args:
        article: an lxml element (anything exposing ``xpath``) to search in.

    Returns:
        The pretty-printed serialization of the first descendant ``div``
        whose ``class`` attribute contains "userContent", as a text string.

    Raises:
        IndexError: if *article* has no such descendant ``div``.
    """
    content_div = article.xpath('.//div[contains(@class, "userContent")]')[0]
    # tostring() already returns bytes (ASCII by default, with non-ASCII
    # characters written as numeric character references).  Calling
    # .encode() on that fails on Python 3, so decode it to text once here;
    # the content is unchanged and stays JSON-serializable.
    return etree.tostring(content_div, pretty_print=True).decode('ascii')
# get facebook articles from several websites
def facebook(urls):
    """Collect the posts of several Facebook pages in one browser session.

    Args:
        urls: the page URLs to visit; handed to ``facebook_aux`` as-is.

    Returns:
        The list produced by ``facebook_aux`` for *urls*.
    """
    browser = Browser()
    try:
        return facebook_aux(browser, urls)
    finally:
        # Always shut the browser down, even when visiting or parsing a
        # page raises; otherwise the browser process is left running.
        browser.quit()
# helper function that walks through the given list of URLs
def facebook_aux(b, urls):
    """Visit every URL with browser *b* and collect the posts found there.

    Args:
        b: browser object; ``b.visit(url)`` is called for each URL and
            ``b.html`` is read afterwards.
        urls: the page URLs to visit.  The argument is not modified.

    Returns:
        A list with one dict per URL, each holding the keys ``'origin'``
        (always ``'facebook'``), ``'posts'`` (a list with the result of
        ``get_page_posts`` for every article on the page) and ``'url'``.
        Entries come in reverse order of *urls* (last URL first).  An empty
        *urls* yields an empty list.
    """
    results = []
    # Iterate over a reversed copy instead of popping from the caller's
    # list: the output order (last URL first) stays the same, but the
    # argument is left intact and no recursion depth is consumed per URL.
    for url in reversed(list(urls)):
        b.visit(url)
        articles = get_facebook_articles(b.html)
        results.append({
            'origin': 'facebook',
            # A real list (not a lazy map object) so json.dump can
            # serialize it on Python 3 as well.
            'posts': [get_page_posts(article) for article in articles],
            'url': url,
        })
    return results
# Gets the facebook posts from a html string
def get_facebook_articles(html):
    """Parse *html* and return every div whose class contains
    "userContentWrapper".

    Args:
        html: the page markup as a string.

    Returns:
        The result of the XPath query on the parsed document: the matching
        ``div`` elements, in document order.
    """
    wrapper_query = '//div[contains(@class, "userContentWrapper")]'
    return etree.HTML(html).xpath(wrapper_query)
# saves the most recent posts of the Facebook pages to the file output.json
if __name__ == '__main__':
    # Facebook pages whose posts are scraped.
    urls = [
        'http://www.facebook.com/cesiuminho',
        'http://www.facebook.com/niaeufp',
        'https://www.facebook.com/techinsider',
        'https://www.facebook.com/TheHowToGeek/',
        'https://www.facebook.com/mooclab.club/',
        'https://www.facebook.com/pythonporto/'
    ]
    facebook_articles = facebook(urls)
    # Text mode: json.dump() writes str objects, which a file opened in
    # binary mode rejects on Python 3.  The context manager closes the file
    # even if serialization fails.
    with open('output.json', 'w') as fh:
        json.dump(facebook_articles, fh)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment