Create a gist now

Instantly share code, notes, and snippets.

from lxml import html
import urllib2
# URL template for a blog; format() is given the blog's host name
# (presumably something like "name.blogspot.com" -- confirm with callers).
base_url = "http://{0}"
def grab_url(url):
    """Fetch *url* and return its body parsed as an lxml HTML element tree."""
    response = urllib2.urlopen(url)
    try:
        # NOTE(review): the transcribed line was truncated at
        # "html.fromstring(" -- parsing the response body is the
        # minimal completion consistent with get_html()/cssselect usage.
        return html.fromstring(response.read())
    finally:
        # urllib2 responses are not context managers in py2; close
        # explicitly so the socket is not leaked on parse errors.
        response.close()
def get_html(tree):
    """Serialize an lxml element (or tree) back into an HTML string."""
    markup = html.tostring(tree)
    return markup
class Blog:
    """Scraper for a Blogger blog: walks post listing pages and collects
    per-post metadata into ``self.posts``.

    NOTE(review): several ``cssselect('')`` calls below have lost their CSS
    selector strings (apparently stripped in transcription). They are kept
    verbatim and flagged with TODOs; ``cssselect('')`` raises a selector
    syntax error, so the originals must be restored before this runs.
    """

    def __init__(self, blog_name):
        # Per-instance accumulator. (The original declared ``posts = []``
        # as a class attribute, which would be shared -- and mutated --
        # across every Blog instance.)
        self.posts = []
        self.blog_name = blog_name
        self.blog_url = base_url.format(blog_name)

    def parse_post(self, post):
        """Extract date, URL and title from one post element.

        Returns a dict with keys 'human_date', 'date', 'url', 'title'.
        """
        post_data = {}
        # TODO(review): selector lost in transcription -- restore
        # (presumably the human-readable date node).
        post_data['human_date'] = post.cssselect('')[0].text_content()
        post_data['date'] = post.cssselect('abbr.published')[0].get('title')
        post_data['url'] = post.cssselect('a.timestamp-link')[0].get('href')
        # TODO(review): selector lost in transcription -- restore
        # (presumably the post-title node).
        title = post.cssselect('')
        if len(title):
            title = title[0]
            post_data['title'] = title.text_content().replace("\n", "")
        else:
            # Fallback for title-less posts.
            # NOTE(review): the transcribed code assigned 'Thoughts'
            # unconditionally, clobbering any parsed title; an else
            # branch is the only reading under which both assignments
            # make sense -- confirm against the original gist.
            post_data['title'] = 'Thoughts'
        # Debug output; print() with a single argument behaves the same
        # under Python 2 and 3.
        print(post_data)
        return post_data

    def get_post_content(self, post):
        """Fetch the post's own page and attach its body HTML under 'html'."""
        document = grab_url(post['url'])
        # TODO(review): selector lost in transcription -- restore
        # (presumably the post-body container).
        post_body = get_html(document.cssselect('')[0])
        post['html'] = post_body

    def get_posts(self, link=None):
        """Collect posts starting from *link* (or the blog's root page)."""
        url = link if link else self.blog_url
        document = grab_url(url)
        # TODO(review): selector lost in transcription -- restore
        # (one element per post on the listing page).
        posts = document.cssselect('')
        for post in posts:
            # NOTE(review): the loop body was lost in transcription;
            # appending parsed posts is the only use of self.posts,
            # so this is the minimal reconstruction -- confirm.
            self.posts.append(self.parse_post(post))
        # TODO(review): selector lost in transcription -- restore
        # (presumably the "older posts" pagination link).
        older_link = document.cssselect('')
        if len(older_link):
            # NOTE(review): the pagination follow-up was also lost;
            # recursing on the older-posts href matches the link=None
            # signature -- confirm against the original gist.
            return self.get_posts(older_link[0].get('href'))
        return self.posts

A little wrapper to grab Blogger posts — it still needs to be made asynchronous.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment