Skip to content
Create a gist now

Instantly share code, notes, and snippets.

from lxml import html
import urllib2
# URL template for a blog's root; format with the blog's hostname,
# e.g. base_url.format("example.blogspot.com").
base_url = "http://{0}"
def grab_url(url):
    """Fetch *url* and return the page parsed into an lxml document tree.

    NOTE(review): the original return line was truncated in transcription
    ("return html.fromstring("); passing the raw response body is the
    standard lxml usage. The response is now closed explicitly so the
    socket is not leaked.
    """
    response = urllib2.urlopen(url)
    try:
        return html.fromstring(response.read())
    finally:
        response.close()
def get_html(tree):
    """Serialize an lxml element *tree* back into an HTML string."""
    markup = html.tostring(tree)
    return markup
class Blog:
posts = []
def __init__(self, blog_name):
self.blog_name = blog_name
self.blog_url = base_url.format(blog_name)
def parse_post(self, post):
post_data = {}
post_data['human_date'] = post.cssselect('')[0].text_content()
post_data['date'] = post.cssselect('abbr.published')[0].get('title')
post_data['url'] = post.cssselect('a.timestamp-link')[0].get('href')
title = post.cssselect('')
if len(title):
title = title[0]
post_data['title'] = title.text_content().replace("\n", "")
post_data['title'] = 'Thoughts'
print post_data
return post_data
def get_post_content(self, post):
document = grab_url(post['url'])
post_body = get_html(document.cssselect('')[0])
post['html'] = post_body
def get_posts(self, link=None):
url = link if link else self.blog_url
document = grab_url(url)
posts = document.cssselect('')
for post in posts:
older_link = document.cssselect('')
if len(older_link):
return self.posts

A little wrapper to grab Blogger posts — it still needs to be made asynchronous.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Something went wrong with that request. Please try again.