Skip to content

Instantly share code, notes, and snippets.

@jeffgodwyll
Created January 13, 2015 00:40
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jeffgodwyll/1413094a3627bd412641 to your computer and use it in GitHub Desktop.
Save jeffgodwyll/1413094a3627bd412641 to your computer and use it in GitHub Desktop.
#!/usr/bin/python
'''
Convert Blogger xml-exported file to markdown
Primarily from https://gist.github.com/larsks/4022537 with some modifications
'''
import os
import sys
import argparse
import iso8601
import re
import subprocess
import logging
from lxml import etree
# Python 2 only: make UTF-8 the implicit str/unicode conversion codec so the
# feed's non-ASCII text can be mixed with byte strings further down.
# ``reload`` is required because the interpreter start-up removes
# ``sys.setdefaultencoding``.  Python 3 has neither name (and already uses
# UTF-8 as its default encoding), so the hack is skipped there instead of
# dying with a NameError.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 -- a builtin on Python 2
    sys.setdefaultencoding("utf-8")
# Prefix -> URI map handed to every xpath() call in this script.  Only the
# ``atom`` prefix appears in the queries below.
namespaces = dict(
    atom='http://www.w3.org/2005/Atom',
    app='http://purl.org/atom/app#',
)

# ``term`` value of the "kind" category that marks an entry as a blog post.
kind_post = 'http://schemas.google.com/blogger/2008/kind#post'
# Make sure the default output directory exists.  Creating it and tolerating
# "already there" avoids the window between an exists() check and makedirs()
# in which another process could create the directory and crash this script.
try:
    os.makedirs('posts')
except OSError:
    # Re-raise only when creation really failed (nothing exists at the path).
    if not os.path.exists('posts'):
        raise
def parse_args(argv=None):
    """Parse the converter's command line.

    Args:
        argv: optional list of argument strings.  ``None`` (the default)
            makes argparse read ``sys.argv[1:]``, exactly as before.

    Returns:
        An ``argparse.Namespace`` with:
          * ``input``      -- path of the exported Blogger XML file;
          * ``output_dir`` -- directory for the generated files
            (``'posts'`` unless ``--output-dir``/``-d`` is given);
          * ``converter``  -- ``'html2text'``, the only converter offered.
    """
    p = argparse.ArgumentParser(
        description='Convert a Blogger XML export to Markdown posts.')
    p.add_argument('--html2text',
                   action='store_const', const='html2text', dest='converter',
                   help='convert HTML with the html2text command (default)')
    p.add_argument('--output-dir', '-d', default='posts',
                   help='directory the .md files are written to '
                        '(default: %(default)s)')
    p.add_argument('input', help='Blogger XML export file')
    p.set_defaults(converter='html2text')
    return p.parse_args(argv)
def markdownify_html2text(html, command=('html2text', '-d', '-b', '0')):
    """Convert HTML to Markdown by piping it through an external command.

    Args:
        html: the HTML fragment, either UTF-8 encoded bytes or text (text is
            encoded as UTF-8 before being sent).
        command: argv of the converter to run.  The default runs ``html2text``
            with ``-d -b 0`` (presumably its dash-list and body-width
            options -- confirm against the installed html2text's usage).

    Returns:
        The raw bytes the command printed on its standard output.

    Raises:
        OSError: the command could not be started (e.g. not installed).
        subprocess.CalledProcessError: the command exited with a non-zero
            status, so its output cannot be trusted.
    """
    # The pipe is binary: always hand it bytes instead of relying on an
    # implicit unicode -> bytes conversion (which fails on Python 3).
    if not isinstance(html, bytes):
        html = html.encode('utf-8')

    argv = list(command)
    p = subprocess.Popen(argv,
                         stdin=subprocess.PIPE,
                         stdout=subprocess.PIPE)
    stdout, _ = p.communicate(input=html)
    if p.returncode != 0:
        # Do not silently return a truncated or empty conversion.
        raise subprocess.CalledProcessError(p.returncode, argv)
    return stdout
def process_entry(entry):
    """Extract the fields of one feed entry that are needed to write a post.

    Args:
        entry: an element for one ``atom:entry`` node; it must offer the
            ``xpath(path, namespaces=...)`` method used below.

    Returns:
        ``None`` when the entry is not a blog post (its "kind" category is
        missing or is not ``kind_post``), or when it has no publication date
        or no HTML permalink (both are logged as errors).  Otherwise a dict:
          * ``id``      -- text of ``atom:id``;
          * ``title``   -- title on one line, runs of spaces collapsed
            (``''`` for an untitled post);
          * ``date``    -- publication date as zero-padded ``YYYY-MM-DD``;
          * ``tags``    -- list of the entry's Blogger category terms;
          * ``href``    -- the ``rel="alternate"`` HTML link;
          * ``content`` -- text of ``atom:content`` (``''`` when empty);
          * ``slug``    -- last path segment of ``href`` without ``.html``.
    """
    def first(path):
        # First node matching ``path`` below this entry, or None.
        nodes = entry.xpath(path, namespaces=namespaces)
        return nodes[0] if nodes else None

    def text_of(node):
        # Text of ``node``; '' for a missing node or an empty element, so
        # callers never have to deal with None.
        if node is None or node.text is None:
            return ''
        return node.text

    kind = first(
        'atom:category[@scheme="http://schemas.google.com/g/2005#kind"]')
    if kind is None or kind.get('term') != kind_post:
        return None

    eid = text_of(first('atom:id'))

    # Untitled posts have an empty <title/> whose text is None.
    title = text_of(first('atom:title[@type="text"]'))
    title = title.strip().replace('\n', ' ')
    title = re.sub(' +', ' ', title)

    published = text_of(first('atom:published'))
    if not published:
        logging.error('no published date for id %s', eid)
        return None
    published = iso8601.parse_date(published)
    # Zero-pad so the result is a valid ISO 8601 calendar date.
    published = '%04d-%02d-%02d' % (
        published.year,
        published.month,
        published.day)

    tags = entry.xpath(
        'atom:category[@scheme="http://www.blogger.com/atom/ns#"]',
        namespaces=namespaces)
    # Drop categories without a term: None cannot be joined into the header.
    tags = [x.get('term') for x in tags if x.get('term') is not None]

    link = first('atom:link[@rel="alternate" and @type="text/html"]')
    href = link.get('href') if link is not None else None
    if not href:
        logging.error('no link for id %s', eid)
        return None

    # Strip only a trailing '.html'; replace() would also remove the text
    # from the middle of the file name.
    slug = href.split('/')[-1]
    if slug.endswith('.html'):
        slug = slug[:-len('.html')]

    content = text_of(first('atom:content'))

    return dict(
        id=eid,
        title=title,
        date=published,
        tags=tags,
        href=href,
        content=content,
        slug=slug,
    )
def write_entry(entry, data, opts):
    """Write one post as ``<output_dir>/<slug>.md``.

    The file starts with ``title``, ``date``, ``published`` and ``tags``
    header lines, then a blank line, then the post body converted to
    Markdown.

    Args:
        entry: the feed entry the data came from; unused, kept so existing
            callers keep working.
        data: dict with at least ``content``, ``slug``, ``title``, ``date``
            and ``tags`` (a list of strings).
        opts: object with ``converter`` and ``output_dir`` attributes.

    Raises:
        ValueError: ``opts.converter`` is not a known converter.
        OSError: the output directory or file could not be created.
    """
    import io  # local import: io.open takes an explicit encoding on 2 and 3

    if opts.converter == 'html2text':
        mdfunc = markdownify_html2text
    else:
        raise ValueError('Unknown converter (%s)' % opts.converter)

    # The converter is fed UTF-8 bytes; an empty post may carry None content.
    md = mdfunc((data['content'] or u'').encode('utf-8'))
    if isinstance(md, bytes):
        # Undecodable bytes are replaced rather than aborting the export.
        md = md.decode('utf-8', 'replace')

    # --output-dir may name a directory that does not exist yet.
    try:
        os.makedirs(opts.output_dir)
    except OSError:
        if not os.path.isdir(opts.output_dir):
            raise

    # Write Markdown to <output_dir>/<slug>.md, always as UTF-8.
    path = os.path.join(opts.output_dir, '%s.md' % data['slug'])
    with io.open(path, 'w', encoding='utf-8') as fd:
        fd.write(u'title: %(title)s\n' % data)
        fd.write(u'date: %(date)s\n' % data)
        fd.write(u'published: true\n')
        fd.write(u'tags: [%s]\n' % u', '.join(data['tags']))
        fd.write(u'\n')
        fd.write(md)
def main():
    """Convert every blog post of the exported feed into a Markdown file.

    Reads the command line, parses the feed named there, and for each
    ``atom:entry`` that ``process_entry`` accepts writes one file through
    ``write_entry``.  Progress is reported through ``logging`` at INFO level.
    """
    opts = parse_args()
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s [%(levelname)s] %(name)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')

    # Binary mode: give the XML parser the raw bytes so it decodes them
    # according to the document itself, not the platform's text encoding.
    with open(opts.input, 'rb') as fd:
        logging.info('parsing feed')
        doc = etree.parse(fd)

    written = 0
    for entry in doc.xpath('//atom:entry', namespaces=namespaces):
        data = process_entry(entry)
        if data is None:
            # Not a post (settings, comments, ...) or unusable; skip it.
            continue
        write_entry(entry, data, opts)
        written += 1
    logging.info('wrote %d posts to %s', written, opts.output_dir)


if __name__ == '__main__':
    main()
@saviour123
Copy link

Can you please add comments?

@jeffgodwyll
Copy link
Author

@saviour123, what parts do you need help with?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment