Skip to content

@thedjpetersen /pyblogger.py
Created

Embed URL

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
from lxml import html
import urllib2
# URL template for a blog's front page; the {0} placeholder is filled with
# the blog name via str.format (see Blog.__init__).
base_url = "http://{0}.blogspot.com"
def grab_url(url):
    """Download *url* and return its content parsed by ``lxml.html``.

    Args:
        url: Address of the page to fetch.

    Returns:
        The object produced by ``html.fromstring`` for the response body.

    Raises:
        Whatever ``urllib2.urlopen`` raises when the request fails.
    """
    response = urllib2.urlopen(url)
    try:
        return html.fromstring(response.read())
    finally:
        # Release the underlying connection even if reading or parsing
        # fails; the response object is not closed automatically.
        response.close()
def get_html(tree):
    """Serialize the parsed element *tree* back to markup via ``html.tostring``."""
    markup = html.tostring(tree)
    return markup
class Blog(object):
    """Scraper that collects the posts of one blog.

    Attributes:
        blog_name: The name given to the constructor.
        blog_url: ``base_url`` formatted with ``blog_name``.
        posts: List of post dicts accumulated by ``get_posts``.
    """

    def __init__(self, blog_name):
        """Store the blog name and derive the front-page URL from it."""
        self.blog_name = blog_name
        self.blog_url = base_url.format(blog_name)
        # Per-instance list. A class-level ``posts = []`` would be a single
        # list shared by every Blog object, mixing posts of different blogs.
        self.posts = []

    def parse_post(self, post):
        """Build a dict describing one post from its listing-page element.

        Args:
            post: Element for one ``div.date-outer`` block, as selected in
                ``get_posts``.

        Returns:
            A dict with the keys ``human_date``, ``date``, ``url``, ``title``
            and ``html`` (the last one is filled in by ``get_post_content``).

        Raises:
            IndexError: If the date header, the ``abbr.published`` element or
                the timestamp link is missing from *post*.
        """
        post_data = {}
        post_data['human_date'] = post.cssselect('h2.date-header')[0].text_content()
        post_data['date'] = post.cssselect('abbr.published')[0].get('title')
        post_data['url'] = post.cssselect('a.timestamp-link')[0].get('href')

        titles = post.cssselect('h3.post-title')
        if titles:
            post_data['title'] = titles[0].text_content().replace("\n", "")
        else:
            # Posts without a title element get a fixed placeholder title.
            post_data['title'] = 'Thoughts'

        self.get_post_content(post_data)
        # Echo each parsed post to stdout; the parenthesised form prints the
        # same thing on Python 2 and also parses on Python 3.
        print(post_data)
        return post_data

    def get_post_content(self, post):
        """Fetch the post's own page and store its body markup in ``post['html']``.

        Args:
            post: Post dict containing at least the key ``url``; it is
                modified in place.

        Raises:
            IndexError: If the fetched page has no ``div.post-body`` element.
        """
        document = grab_url(post['url'])
        bodies = document.cssselect('div.post-body')
        post['html'] = get_html(bodies[0])

    def get_posts(self, link=None):
        """Collect posts page by page, following the "older posts" links.

        Args:
            link: URL of the page to start from; when omitted or empty the
                blog's front page (``self.blog_url``) is used.

        Returns:
            ``self.posts``, extended with every post found. Calling this
            method again appends to the same list.
        """
        url = link if link else self.blog_url
        # Iterate instead of recursing once per page, so a blog with many
        # pages cannot exhaust the interpreter's recursion limit.
        while url:
            document = grab_url(url)
            for post in document.cssselect('div.date-outer'):
                self.posts.append(self.parse_post(post))
            older_link = document.cssselect('a.blog-pager-older-link')
            # Stop when there is no older page (or its link has no href)
            # rather than starting over from the front page.
            url = older_link[0].get('href') if older_link else None
        return self.posts
@thedjpetersen

A little wrapper for grabbing Blogger posts; it still needs to be made asynchronous.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Something went wrong with that request. Please try again.