#!/usr/bin/python
# vox2mtif
# Copyright 2009 Randy Reddig - http://ydnar.com
# Converts a Vox blog's public posts into a format suitable for importing into TypePad or Movable Type.
# http://www.movabletype.org/documentation/appendices/import-export-format.html
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# 'Software'), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import re
import sys
import time
import codecs
import locale
import getopt
# http://code.google.com/p/httplib2
import httplib2
# http://www.feedparser.org/
import feedparser
# http://www.crummy.com/software/BeautifulSoup/
from BeautifulSoup import BeautifulSoup
class VoxPop:
    """Collect the public posts of one Vox blog and serialize them as MTIF.

    Typical use: call parse_feeds() to load every feed entry into
    ``self.entries``; then, for each entry, call map_entry_categories() and
    fetch_entry_content(); finally call serialize_entry() (one post) or
    serialize_entries() (all posts) to get Movable Type Import Format text.
    """

    # Categories assigned to every imported post.
    DEFAULT_CATEGORIES = (
        'Archive',
        'Vox',
        'Imported',
    )

    # Vox tag term -> Movable Type category name.
    # Several tags may map to the same category.
    TAGS_TO_CATEGORIES = {
        'apple': 'Apple',
        'art': 'Art',
        'awesome': 'Awesome',
        'bicycle': 'Cycling',
        'california': 'California',
        'climbing': 'Climbing',
        'code': 'Code',
        'cycling': 'Cycling',
        'dance': 'Music',
        'design': 'Design',
        'electro': 'Music',
        'film': 'Film',
        'friends': 'Friends',
        'helvetica': 'Typography',
        'hawaii': 'Hawaii',
        'javascript': 'JavaScript',
        'new york': 'New York',
        'politics': 'Politics',
        'python': 'Python',
        'ruby': 'Ruby',
        'seattle': 'Seattle',
        'six apart': 'Six Apart',
        'san francisco': 'San Francisco',
        'travel': 'Travel',
        'typography': 'Typography',
        'vacation': 'Travel',
        'vox': 'Six Apart',
        'ydnar': 'ydnar',
    }

    def __init__(self, username):
        """Remember the blog's username and create the HTTP client.

        The client is constructed with ".cache" as its cache argument.
        """
        self.username = username
        self.entries = []
        self.h = httplib2.Http(".cache")

    def __url(self):
        """Return the URL of the first Atom feed page for this blog."""
        return 'http://%s.vox.com/library/posts/atom.xml' % self.username

    url = property(__url)

    def fetch(self, url):
        """Request ``url`` and return the response body (headers are ignored)."""
        _response, content = self.h.request(url)
        return content

    # Fetch all entries from a Vox blog.
    def parse_feeds(self):
        """Reset ``self.entries`` and fill it by following the feed's 'next' links."""
        self.entries = []
        seen = set()
        next_url = self.url
        # Stop on a missing 'next' link, or on a URL already visited so that a
        # feed whose 'next' links form a cycle cannot loop forever.
        while next_url and next_url not in seen:
            seen.add(next_url)
            next_url = self.parse_feed(next_url)

    # Parse a single Atom feed, add its entries, and return the next URL, if any.
    def parse_feed(self, url):
        """Append the entries of the feed at ``url`` to ``self.entries``.

        Returns the href of the feed's rel="next" link, or None when the feed
        has no such link (including when it has no links at all).
        """
        print(u'Parsing feed: %s' % url)
        data = feedparser.parse(self.fetch(url))
        self.entries.extend(data.entries)
        for link in data.feed.get('links', []):
            if link.get('rel') == 'next':
                return link.get('href')
        return None

    # Generate MT categories for each Vox tag. YMMV.
    def map_entry_categories(self, entry):
        """Store the entry's category list under ``entry['mapped_categories']``.

        The list starts with DEFAULT_CATEGORIES and then gains the category of
        every tag found in TAGS_TO_CATEGORIES. Each category appears once, even
        when several tags map to it.
        """
        print(u'Mapping categories for: %s' % entry.title)
        categories = list(self.DEFAULT_CATEGORIES)
        for tag in entry.get('tags') or ():
            category = self.TAGS_TO_CATEGORIES.get(tag.term)
            # Compare the mapped category (not the raw tag term) against what
            # has been collected so far, otherwise duplicates slip through.
            if category is not None and category not in categories:
                categories.append(category)
        entry['mapped_categories'] = categories

    # Vox feed content is a subset of the actual full post content, so fetch the original.
    def fetch_entry_content(self, entry):
        """Download the post page and store its body under ``entry['full_content']``.

        The body is the content of the first div whose class is 'asset-body'
        or, failing that, 'asset-body preview-links'. When neither is found the
        feed's own summary (or an empty string) is stored instead of crashing.
        """
        print(u'Fetching content for: %s' % entry.title)
        soup = BeautifulSoup(self.fetch(entry.link))
        div = None
        for css_class in ('asset-body', 'asset-body preview-links'):
            div = soup.find('div', {'class': css_class})
            if div is not None:
                break
        if div is None:
            # NOTE(review): assumes the feed entry exposes its excerpt under
            # 'summary' -- confirm against the feeds Vox actually serves.
            print(u'No post body found at %s; using feed summary instead.' % entry.link)
            entry['full_content'] = entry.get('summary') or u''
        else:
            entry['full_content'] = div.decodeContents()

    def serialize_entries(self):
        """Return the MTIF text of every entry in ``self.entries``, concatenated."""
        return u''.join(self.serialize_entry(entry) for entry in self.entries)

    def serialize_entry(self, entry):
        """Return one entry as an MTIF record.

        Requires ``entry['full_content']`` (set by fetch_entry_content) and
        raises KeyError when it is missing. ``entry['mapped_categories']`` is
        optional. Entries without an author are attributed to the username.
        """
        original_url = entry.link.replace('?_c=feed-atom', '')
        author = entry.get('author') or self.username
        mtif = [
            u'TITLE: %s' % entry.title,
            u'AUTHOR: %s' % author,
            u'DATE: %s' % time.strftime('%m/%d/%Y %H:%M:%S', entry.published_parsed),
            # Vox content is always HTML. TypePad can mangle with its RTE unless this is set.
            u'CONVERT BREAKS: 1',
            # Hard-coded to publish.
            u'STATUS: publish',
        ]
        for category in entry.get('mapped_categories') or ():
            mtif.append(u'CATEGORY: %s' % category)
        mtif.append(u'-----')
        mtif.append(u'BODY:')
        mtif.append(u'<p class="import">Originally posted to <a href="%s" rel="alternate nofollow">%s.vox.com</a> on %s.</p>' % (
            original_url, self.username, time.strftime('%B %d, %Y', entry.published_parsed)))
        mtif.append(entry['full_content'])
        mtif.append(u'-----')
        mtif.append(u'--------\n')
        return u'\n'.join(mtif)
class Usage(Exception):
    """Signals a command-line usage error; ``msg`` holds the explanation."""

    def __init__(self, msg):
        # Initialise the base class as well so str(err) and err.args carry the
        # message instead of being empty.
        Exception.__init__(self, msg)
        self.msg = msg
def main(argv=None):
    """Export a Vox blog to ``<username>.mtif`` in the current directory.

    ``argv`` defaults to ``sys.argv``; its first positional argument is the
    Vox username. Returns 0 on success or after printing help, and 2 on a
    usage error (unknown option or missing username).
    """
    if argv is None:
        argv = sys.argv
    try:
        try:
            opts, args = getopt.getopt(argv[1:], 'h', ['help'])
        except getopt.error as msg:
            raise Usage(msg)
        # Honour the advertised -h/--help option instead of ignoring it.
        for option, _value in opts:
            if option in ('-h', '--help'):
                sys.stdout.write('Usage: %s <username>\n' % argv[0])
                sys.stdout.write('Writes the public posts of <username>.vox.com to <username>.mtif\n')
                return 0
        if not args:
            raise Usage('No username specified.')
    except Usage as err:
        sys.stderr.write('Usage: %s <username>\n' % argv[0])
        sys.stderr.write('%s\n' % err.msg)
        sys.stderr.write('for help use --help\n')
        return 2

    # Progress messages contain post titles, so emit them as UTF-8.
    sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

    vp = VoxPop(args[0])
    f = codecs.getwriter('utf-8')(open('%s.mtif' % vp.username, 'wb'))
    try:
        print(u'\nFetching Atom feeds for %s...' % vp.username)
        vp.parse_feeds()
        print(u'Found %d posts.' % len(vp.entries))

        print(u'\nMapping categories...')
        for entry in vp.entries:
            vp.map_entry_categories(entry)

        print(u'\nFetching post content (this could take a while)...')
        for entry in vp.entries:
            vp.fetch_entry_content(entry)
            # Write and flush each post as soon as it is fetched so that an
            # interrupted run still leaves the completed posts on disk.
            f.write(vp.serialize_entry(entry))
            f.flush()
    finally:
        # Close the output file even when a fetch or serialization fails.
        f.close()
    return 0
# Run the exporter only when executed as a script, and hand main()'s return
# value to the interpreter as the process exit status.
if __name__ == '__main__':
    exit_status = main()
    sys.exit(exit_status)