Skip to content

Instantly share code, notes, and snippets.

@larsks
Created November 6, 2012 04:23
Show Gist options
  • Save larsks/4022537 to your computer and use it in GitHub Desktop.
Save larsks/4022537 to your computer and use it in GitHub Desktop.
Convert Blogger posts to Markdown for use with Scriptogr.am
#!/usr/bin/python
import os
import sys
import argparse
import iso8601
import re
import subprocess
import logging
import json
import requests
from lxml import etree
from lxml.cssselect import CSSSelector
from HTMLParser import HTMLParser
namespaces = {
'atom': 'http://www.w3.org/2005/Atom',
'app': 'http://purl.org/atom/app#',
}
kind_post = 'http://schemas.google.com/blogger/2008/kind#post'
markdown_api = 'http://fuckyeahmarkdown.com/go/'
def parse_args():
p = argparse.ArgumentParser()
p.add_argument('--online', '--fuckyeah',
action='store_const', const='online', dest='converter')
p.add_argument('--pandoc',
action='store_const', const='pandoc', dest='converter')
p.add_argument('--html2text',
action='store_const', const='html2text', dest='converter')
p.add_argument('--output-dir', '-d', default='posts')
p.add_argument('input')
p.set_defaults(converter='pandoc')
return p.parse_args()
def markdownify_html2text(html):
p = subprocess.Popen(['html2text', '-d', '-b', '0', ],
stdin=subprocess.PIPE,
stdout=subprocess.PIPE)
stdout, stderr = p.communicate(input=html.encode('utf-8'))
return stdout
def markdownify_pandoc(html):
p = subprocess.Popen(['pandoc', '--strict', '--normalize',
'-f', 'html', '-t', 'markdown', '-'],
stdin=subprocess.PIPE,
stdout=subprocess.PIPE)
stdout, stderr = p.communicate(input=html.encode('utf-8'))
return stdout
def markdownify_online(html):
r = requests.post(markdown_api,
data=dict(html=html))
return r.content
def process_entry(entry):
kind = entry.xpath(
'atom:category[@scheme="http://schemas.google.com/g/2005#kind"]',
namespaces=namespaces)[0]
if kind.get('term') != kind_post:
return
eid = entry.xpath('atom:id',
namespaces=namespaces)[0].text
title = entry.xpath('atom:title[@type="text"]',
namespaces=namespaces)[0]
title = title.text.strip().replace('\n', ' ')
title = re.sub(' +', ' ', title)
published = entry.xpath('atom:published', namespaces=namespaces)[0].text
published = iso8601.parse_date(published)
published = '%s-%s-%s' % (
published.year,
published.month,
published.day)
tags = entry.xpath(
'atom:category[@scheme="http://www.blogger.com/atom/ns#"]',
namespaces=namespaces)
tags = [ x.get('term') for x in tags ]
try:
href = entry.xpath('atom:link[@rel="alternate" and @type="text/html"]',
namespaces=namespaces)[0].get('href')
except IndexError:
logging.error('no link for id %s' % eid)
return
slug = href.split('/')[-1].replace('.html', '')
content = entry.xpath('atom:content',
namespaces=namespaces)[0].text
return dict(
id=eid,
title=title,
date=published,
tags=tags,
href=href,
content=content,
slug=slug,
)
def update_content(entry):
'''Blogger performs some odd transformations on <pre> blocks when
producing the Atom feed. Here we replace the content from the XML file
by fetching it directly from the <link> specified for the entry.'''
logging.info('Updating content from %(href)s' % entry)
r = requests.get(entry['href'])
doc = etree.fromstring(r.content,
parser = etree.HTMLParser())
content = CSSSelector('div.entry-content')(doc)[0]
entry['content'] = etree.tostring(content)
def write_entry(entry, data, opts):
# Write xml data to posts/slug.xml.
with open(os.path.join(opts.output_dir, '%s.xml' % data['slug']), 'w') as fd:
fd.write(etree.tostring(entry))
# Write HTML content to posts/slug.html
with open(os.path.join(opts.output_dir, '%s.html' % data['slug']), 'w') as fd:
fd.write(data['content'].encode('utf-8'))
if opts.converter == 'online':
mdfunc = markdownify_online
elif opts.converter == 'pandoc':
mdfunc = markdownify_pandoc
elif opts.converter == 'html2text':
mdfunc = markdownify_html2text
else:
raise ValueError('Unknown converter (%s)' % opts.converter)
# Write Markdown to posts/slug.md
md = mdfunc(data['content'].encode('utf-8'))
with open(os.path.join(opts.output_dir, '%s.md' % data['slug']), 'w') as fd:
fd.write('Title: %(title)s\n' % data)
fd.write('Date: %(date)s\n' % data)
fd.write('Tags: %s\n' % ' '.join(data['tags']))
fd.write('\n')
fd.write(md)
def main():
opts = parse_args()
logging.basicConfig(level=logging.INFO,
format='%(asctime)s [%(levelname)s] %(name)s: %(message)s',
datefmt='%Y-%m-%d %H:%M:%S')
with open(opts.input) as fd:
logging.info('parsing feed')
doc = etree.parse(fd)
for entry in doc.xpath('//atom:entry', namespaces=namespaces):
data = process_entry(entry)
if data is None:
continue
update_content(data)
write_entry(entry, data, opts)
if __name__ == '__main__':
main()
@dwlf
Copy link

dwlf commented Nov 12, 2014

What's the license on this code?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment