Skip to content

Instantly share code, notes, and snippets.

@emeraldjava
Forked from larsks/blogger2scriptogram.py
Created December 3, 2019 20:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save emeraldjava/2ffcb0ca73e8f74881ad4bb1033ea07e to your computer and use it in GitHub Desktop.
Save emeraldjava/2ffcb0ca73e8f74881ad4bb1033ea07e to your computer and use it in GitHub Desktop.
Convert Blogger posts to Markdown for use with Scriptogr.am
#!/usr/bin/python
import os
import sys
import argparse
import iso8601
import re
import subprocess
import logging
import json
import requests
from lxml import etree
from lxml.cssselect import CSSSelector
from HTMLParser import HTMLParser
# Prefix-to-URI map handed to every XPath query made against the feed.
namespaces = dict(
    atom='http://www.w3.org/2005/Atom',
    app='http://purl.org/atom/app#',
)

# "kind" category term that process_entry() requires before it treats an
# entry as a blog post.
kind_post = 'http://schemas.google.com/blogger/2008/kind#post'

# URL that markdownify_online() POSTs HTML to for conversion.
markdown_api = 'http://fuckyeahmarkdown.com/go/'
def parse_args():
    '''Parse the command line and return the argparse namespace.

    The namespace carries ``input`` (the positional feed path),
    ``output_dir`` (``--output-dir``/``-d``, default ``posts``) and
    ``converter`` (``online``, ``pandoc`` or ``html2text``; default
    ``pandoc``).
    '''
    parser = argparse.ArgumentParser()

    # Every converter switch stores its own name into the shared
    # ``converter`` destination, so the last switch given wins.
    converter_switches = (
        (('--online', '--fuckyeah'), 'online'),
        (('--pandoc',), 'pandoc'),
        (('--html2text',), 'html2text'),
    )
    for flags, name in converter_switches:
        parser.add_argument(*flags,
                            action='store_const', const=name,
                            dest='converter')

    parser.add_argument('--output-dir', '-d', default='posts')
    parser.add_argument('input')
    parser.set_defaults(converter='pandoc')

    return parser.parse_args()
def markdownify_html2text(html):
    '''Convert HTML to Markdown by piping it through the ``html2text`` command.

    ``html`` may be text or an already UTF-8 encoded byte string.  Returns
    whatever ``html2text`` wrote to its standard output.  A non-zero exit
    status is logged but the (possibly empty) output is still returned.
    '''
    # write_entry() hands over content that is already encoded.  Encoding a
    # byte string again fails (implicit ASCII decode on Python 2, no such
    # method on Python 3), so only real text is encoded here.
    if not isinstance(html, bytes):
        html = html.encode('utf-8')

    p = subprocess.Popen(['html2text', '-d', '-b', '0'],
                         stdin=subprocess.PIPE,
                         stdout=subprocess.PIPE)
    # stderr is not piped, so only stdout carries data back.
    stdout, _ = p.communicate(input=html)
    if p.returncode != 0:
        logging.error('html2text exited with status %s', p.returncode)
    return stdout
def markdownify_pandoc(html):
    '''Convert HTML to Markdown by piping it through the ``pandoc`` command.

    ``html`` may be text or an already UTF-8 encoded byte string.  Returns
    whatever ``pandoc`` wrote to its standard output.  A non-zero exit
    status is logged but the (possibly empty) output is still returned.
    '''
    # write_entry() hands over content that is already encoded.  Encoding a
    # byte string again fails (implicit ASCII decode on Python 2, no such
    # method on Python 3), so only real text is encoded here.
    if not isinstance(html, bytes):
        html = html.encode('utf-8')

    # NOTE(review): --strict and --normalize appear to have been dropped by
    # later pandoc releases -- confirm against the installed version.
    p = subprocess.Popen(['pandoc', '--strict', '--normalize',
                          '-f', 'html', '-t', 'markdown', '-'],
                         stdin=subprocess.PIPE,
                         stdout=subprocess.PIPE)
    # stderr is not piped, so only stdout carries data back.
    stdout, _ = p.communicate(input=html)
    if p.returncode != 0:
        logging.error('pandoc exited with status %s', p.returncode)
    return stdout
def markdownify_online(html):
    '''Convert HTML to Markdown using the web service at ``markdown_api``.

    POSTs ``html`` as the ``html`` form field and returns the raw response
    body.  Raises a ``requests`` exception on connection failure, on
    timeout, or when the service answers with an HTTP error status.
    '''
    # Without a timeout a stalled server would hang the whole conversion.
    r = requests.post(markdown_api,
                      data=dict(html=html),
                      timeout=30)
    # An error page must not be returned as if it were the converted post.
    r.raise_for_status()
    return r.content
def process_entry(entry):
    '''Extract the metadata of one Atom ``entry`` element.

    Returns ``None`` when the entry's kind is not ``kind_post`` or when it
    has no HTML ``alternate`` link (the latter is logged as an error).
    Otherwise returns a dict with the keys ``id``, ``title``, ``date``
    (zero-padded ``YYYY-MM-DD``), ``tags`` (list of category terms),
    ``href``, ``content`` and ``slug``.

    Raises ``IndexError`` when the kind category, id, title, published or
    content element is missing from the entry.
    '''
    kind = entry.xpath(
        'atom:category[@scheme="http://schemas.google.com/g/2005#kind"]',
        namespaces=namespaces)[0]
    if kind.get('term') != kind_post:
        return None

    eid = entry.xpath('atom:id', namespaces=namespaces)[0].text

    title = entry.xpath('atom:title[@type="text"]',
                        namespaces=namespaces)[0]
    # .text is None for an empty <title/>; treat that as an empty title
    # instead of failing on None.strip().
    title = (title.text or '').strip().replace('\n', ' ')
    title = re.sub(' +', ' ', title)

    published = entry.xpath('atom:published', namespaces=namespaces)[0].text
    published = iso8601.parse_date(published)
    # Zero-pad month and day so the header reads 2019-01-05, not 2019-1-5.
    published = '%04d-%02d-%02d' % (
        published.year,
        published.month,
        published.day)

    tags = entry.xpath(
        'atom:category[@scheme="http://www.blogger.com/atom/ns#"]',
        namespaces=namespaces)
    tags = [x.get('term') for x in tags]

    links = entry.xpath('atom:link[@rel="alternate" and @type="text/html"]',
                        namespaces=namespaces)
    if not links:
        logging.error('no link for id %s', eid)
        return None
    href = links[0].get('href')

    # The slug is the last path component of the post URL without ".html".
    slug = href.split('/')[-1].replace('.html', '')

    content = entry.xpath('atom:content', namespaces=namespaces)[0].text

    return dict(
        id=eid,
        title=title,
        date=published,
        tags=tags,
        href=href,
        content=content,
        slug=slug,
    )
def update_content(entry):
    '''Blogger performs some odd transformations on <pre> blocks when
    producing the Atom feed.  Here we replace the content from the XML file
    by fetching it directly from the <link> specified for the entry.

    ``entry`` is the dict built by process_entry(); its ``content`` value is
    replaced in place with the serialized ``div.entry-content`` element of
    the page at ``entry['href']``.  When the page has no such element the
    existing content is kept and a warning is logged.
    '''
    logging.info('Updating content from %s', entry['href'])
    # Without a timeout a stalled server would hang the whole conversion.
    r = requests.get(entry['href'], timeout=30)
    doc = etree.fromstring(r.content,
                           parser=etree.HTMLParser())

    matches = CSSSelector('div.entry-content')(doc)
    if not matches:
        # Indexing an empty result would abort the run for every remaining
        # post; fall back to the content taken from the feed instead.
        logging.warning('no div.entry-content at %s; keeping feed content',
                        entry['href'])
        return
    entry['content'] = etree.tostring(matches[0])
def _to_bytes(value):
    '''Return ``value`` as UTF-8 bytes, leaving byte strings untouched.'''
    if isinstance(value, bytes):
        return value
    return value.encode('utf-8')


def write_entry(entry, data, opts):
    '''Write one post as ``<slug>.xml``, ``<slug>.html`` and ``<slug>.md``.

    ``entry`` is the Atom element (serialized verbatim to the ``.xml``
    file), ``data`` is the dict built by process_entry() and ``opts`` is
    the parsed command line (``output_dir`` and ``converter`` are used).
    The output directory is created when it does not exist.

    Raises ``ValueError`` for an unknown converter name, before any file is
    written.
    '''
    converters = {
        'online': markdownify_online,
        'pandoc': markdownify_pandoc,
        'html2text': markdownify_html2text,
    }
    mdfunc = converters.get(opts.converter)
    if mdfunc is None:
        raise ValueError('Unknown converter (%s)' % opts.converter)

    if not os.path.isdir(opts.output_dir):
        os.makedirs(opts.output_dir)

    base = os.path.join(opts.output_dir, data['slug'])
    # Encode once; the same bytes go to the .html file and the converter.
    content = _to_bytes(data['content'] or u'')

    # All files are opened in binary mode: everything written below is
    # bytes, and a text-mode file would reject it (Python 3) or try an
    # implicit ASCII encode of non-ASCII titles and tags (Python 2).

    # Write xml data to posts/slug.xml.
    with open(base + '.xml', 'wb') as fd:
        fd.write(etree.tostring(entry))

    # Write HTML content to posts/slug.html
    with open(base + '.html', 'wb') as fd:
        fd.write(content)

    # Write Markdown to posts/slug.md
    md = mdfunc(content)
    header = u''.join([
        u'Title: %(title)s\n' % data,
        u'Date: %(date)s\n' % data,
        u'Tags: %s\n' % u' '.join(data['tags']),
        u'\n',
    ])
    with open(base + '.md', 'wb') as fd:
        fd.write(_to_bytes(header))
        fd.write(_to_bytes(md))
def main():
    '''Convert every blog post in the feed file named on the command line.

    Parses the arguments, configures logging at INFO level, parses the feed
    and, for each Atom entry that process_entry() accepts, refreshes its
    content and writes its output files.
    '''
    opts = parse_args()

    log_settings = {
        'level': logging.INFO,
        'format': '%(asctime)s [%(levelname)s] %(name)s: %(message)s',
        'datefmt': '%Y-%m-%d %H:%M:%S',
    }
    logging.basicConfig(**log_settings)

    with open(opts.input) as feed:
        logging.info('parsing feed')
        tree = etree.parse(feed)

    entries = tree.xpath('//atom:entry', namespaces=namespaces)
    for entry in entries:
        data = process_entry(entry)
        # process_entry() returns None for entries that are to be skipped.
        if data is not None:
            update_content(data)
            write_entry(entry, data, opts)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment