public
Created

  • Download Gist
pyblogger.py
Python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
from lxml import html
import urllib2
 
base_url = "http://{0}.blogspot.com"
 
def grab_url(url):
    """Fetch *url* and return its body parsed as an lxml HTML tree.

    The page is downloaded with ``urllib2.urlopen`` and the raw body is
    handed to ``lxml.html.fromstring``. Any error raised by either call
    propagates to the caller unchanged.
    """
    response = urllib2.urlopen(url)
    try:
        return html.fromstring(response.read())
    finally:
        # Always release the underlying socket, even when reading or
        # parsing fails; urllib2 responses are not context managers.
        response.close()
 
def get_html(tree):
    """Serialize the lxml element *tree* back into HTML markup.

    Returns whatever ``lxml.html.tostring`` produces for the element.
    """
    markup = html.tostring(tree)
    return markup
 
class Blog:
    """Scraper for the posts of a single ``<name>.blogspot.com`` blog.

    Attributes:
        blog_name: the sub-domain name passed to the constructor.
        blog_url: the blog's front-page URL built from ``base_url``.
        posts: list of post dictionaries collected by ``get_posts``.
    """

    def __init__(self, blog_name):
        """Remember the blog name and derive its front-page URL."""
        self.blog_name = blog_name
        self.blog_url = base_url.format(blog_name)
        # Created per instance: a class-level list would be shared by every
        # Blog object, mixing the posts of unrelated blogs together.
        self.posts = []

    def parse_post(self, post):
        """Build a dictionary describing one post element.

        *post* is one ``div.date-outer`` element from a listing page. The
        result holds the keys ``human_date``, ``date``, ``url``, ``title``
        and ``html`` (the latter filled in by ``get_post_content``, which
        fetches the post's own page).

        Raises ``IndexError`` when the date header, the ``abbr.published``
        element or the timestamp link is missing from *post*.
        """
        post_data = {}
        post_data['human_date'] = post.cssselect('h2.date-header')[0].text_content()
        # presumably the machine-readable timestamp -- confirm against the
        # blog's markup
        post_data['date'] = post.cssselect('abbr.published')[0].get('title')
        post_data['url'] = post.cssselect('a.timestamp-link')[0].get('href')

        title = post.cssselect('h3.post-title')
        if title:
            post_data['title'] = title[0].text_content().replace("\n", "")
        else:
            # Posts without a title element get a fixed placeholder title.
            post_data['title'] = 'Thoughts'

        self.get_post_content(post_data)
        # Debug output kept from the original; the parenthesised form prints
        # the same text for a single argument on Python 2 and Python 3.
        print(post_data)
        return post_data

    def get_post_content(self, post):
        """Fetch ``post['url']`` and store the post body under ``post['html']``.

        The body is the serialized first ``div.post-body`` element of the
        fetched page. *post* is modified in place; nothing is returned.
        """
        document = grab_url(post['url'])
        post['html'] = get_html(document.cssselect('div.post-body')[0])

    def get_posts(self, link=None):
        """Collect posts from *link* (default: the blog front page) onwards.

        Every ``div.date-outer`` element on the page is parsed and appended
        to ``self.posts``; the "older posts" pager link is then followed
        until a page has none. Returns ``self.posts``.
        """
        url = link or self.blog_url
        # Iterate instead of recursing so that a blog with many pages cannot
        # exhaust the interpreter's recursion limit.
        while url:
            document = grab_url(url)
            for post in document.cssselect('div.date-outer'):
                self.posts.append(self.parse_post(post))

            older_link = document.cssselect('a.blog-pager-older-link')
            # A missing pager link, or one without an href, ends the crawl
            # rather than restarting from the front page.
            url = older_link[0].get('href') if older_link else None

        return self.posts

A small wrapper for scraping Blogger (blogspot.com) posts. It currently fetches pages synchronously, one at a time; the fetching should be made asynchronous.

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.