Skip to content

Instantly share code, notes, and snippets.

@thedjpetersen
Created October 2, 2012 05:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thedjpetersen/3816489 to your computer and use it in GitHub Desktop.
Save thedjpetersen/3816489 to your computer and use it in GitHub Desktop.
from lxml import html
import urllib2
# URL template for a blog's front page; ``{0}`` is replaced with the blog
# name via ``base_url.format(blog_name)`` in ``Blog.__init__``.
base_url = "http://{0}.blogspot.com"
def grab_url(url):
    """Download *url* and parse the response body as HTML.

    Args:
        url: Address of the page to fetch.

    Returns:
        The result of ``lxml.html.fromstring`` applied to the response body.

    Any exception raised by ``urllib2.urlopen`` or by the parser propagates
    to the caller unchanged.
    """
    response = urllib2.urlopen(url)
    try:
        return html.fromstring(response.read())
    finally:
        # Always release the underlying connection, even if reading or
        # parsing fails; the original left the response open.
        response.close()
def get_html(tree):
    """Serialize *tree* back to markup using ``html.tostring``.

    Args:
        tree: The element to serialize (passed straight to ``html.tostring``).

    Returns:
        Whatever ``html.tostring`` returns for *tree*.
    """
    markup = html.tostring(tree)
    return markup
class Blog:
    """Scraper that collects the posts of one blog.

    Attributes:
        blog_name: The name passed to the constructor.
        blog_url: Front-page URL, built by formatting ``base_url`` with
            ``blog_name``.
        posts: List of post dicts accumulated by ``get_posts``. Each
            instance owns its own list.
    """

    def __init__(self, blog_name):
        """Remember the blog name and derive its front-page URL.

        Args:
            blog_name: Value substituted into the ``base_url`` template.
        """
        self.blog_name = blog_name
        self.blog_url = base_url.format(blog_name)
        # Per-instance storage. A class-level ``posts = []`` would be one
        # list shared by every Blog object, mixing posts of different blogs.
        self.posts = []

    def parse_post(self, post):
        """Extract the metadata and body of one post element.

        Args:
            post: An element supporting ``cssselect`` (one
                ``div.date-outer`` node as selected by ``get_posts``).

        Returns:
            A dict with the keys ``human_date``, ``date``, ``url``,
            ``title`` and ``html``.

        Raises:
            IndexError: If the date header, published timestamp or
                timestamp link is missing from *post*.
        """
        post_data = {
            'human_date': post.cssselect('h2.date-header')[0].text_content(),
            'date': post.cssselect('abbr.published')[0].get('title'),
            'url': post.cssselect('a.timestamp-link')[0].get('href'),
        }

        title = post.cssselect('h3.post-title')
        if title:
            post_data['title'] = title[0].text_content().replace("\n", "")
        else:
            # Posts without a title element get a fixed placeholder title.
            post_data['title'] = 'Thoughts'

        self.get_post_content(post_data)
        # Progress output kept from the original. The parenthesised form
        # prints the same text under Python 2 and is also valid Python 3.
        print(post_data)
        return post_data

    def get_post_content(self, post):
        """Fetch the post's own page and store its body markup.

        Args:
            post: Post dict containing a ``url`` key. It is modified in
                place: the serialized ``div.post-body`` element is stored
                under the ``html`` key.

        Raises:
            IndexError: If the fetched page has no ``div.post-body``.
        """
        document = grab_url(post['url'])
        post['html'] = get_html(document.cssselect('div.post-body')[0])

    def get_posts(self, link=None):
        """Collect posts from *link* and every older page linked from it.

        Args:
            link: URL of the page to start from. When omitted (or falsy)
                the blog's front page is used.

        Returns:
            ``self.posts``, extended with every post parsed during this
            call. Posts from earlier calls are kept, so calling this twice
            appends the results twice.
        """
        url = link if link else self.blog_url
        # Follow the "older posts" pager iteratively. Recursing once per
        # page can exhaust the recursion limit on long blogs, and a pager
        # link without an href would restart from the front page forever.
        while url:
            document = grab_url(url)
            for post in document.cssselect('div.date-outer'):
                self.posts.append(self.parse_post(post))
            older_link = document.cssselect('a.blog-pager-older-link')
            url = older_link[0].get('href') if older_link else None
        return self.posts
@thedjpetersen
Copy link
Author

A small wrapper for grabbing Blogger posts. It currently fetches pages synchronously and needs to be made asynchronous.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment