Skip to content

Instantly share code, notes, and snippets.

Created May 14, 2015 15:50
Show Gist options
  • Save joshcartme/5f13fc525189b5d22112 to your computer and use it in GitHub Desktop.
Save joshcartme/5f13fc525189b5d22112 to your computer and use it in GitHub Desktop.
Updated Mezzanine to handle pub_date conversion error
from __future__ import unicode_literals
from future.builtins import int

from collections import defaultdict
from datetime import datetime, timedelta
from optparse import make_option
import re
from time import mktime, timezone
from xml.dom.minidom import parse

from django.core.management.base import CommandError
from django.utils.html import linebreaks

from mezzanine.blog.management.base import BaseImporterCommand
class Command(BaseImporterCommand):
    """
    Implements a Wordpress importer. Takes a file path or a URL for the
    Wordpress Extended RSS file.
    """

    option_list = BaseImporterCommand.option_list + (
        make_option("-u", "--url", dest="url", help="URL to import file"),
    )

    def get_text(self, xml, name, nodetype):
        """
        Gets the element's text value from the XML object provided.

        Looks up the first ``wp:comment_<name>`` element below ``xml`` and
        joins the ``data`` of its direct children whose ``nodeType``
        equals ``nodetype`` (e.g. ``TEXT_NODE`` or ``CDATA_SECTION_NODE``).
        Raises ``IndexError`` if no such element exists.
        """
        nodes = xml.getElementsByTagName("wp:comment_" + name)[0].childNodes
        return "".join(n.data for n in nodes if n.nodeType == nodetype)

    def _entry_pub_date(self, entry):
        """
        Converts a feed entry's parsed date into a naive ``datetime``.

        Prefers ``published_parsed`` and falls back to ``updated_parsed``.
        Returns ``None`` when neither is available or when the time struct
        cannot be converted, instead of aborting the whole import.
        """
        parsed = getattr(entry, "published_parsed", None)
        if parsed is None:
            parsed = getattr(entry, "updated_parsed", None)
        if parsed is None:
            return None
        try:
            pub_date = datetime.fromtimestamp(mktime(parsed))
        except (OverflowError, ValueError, OSError):
            # mktime/fromtimestamp reject dates outside the platform's
            # supported range (e.g. placeholder dates on drafts).
            return None
        return pub_date - timedelta(seconds=timezone)

    def handle_import(self, options):
        """
        Gets the posts from either the provided URL or the path if it
        is local.

        Entries whose ``wp_post_type`` is ``"blog"`` are added as posts
        (together with their comments); entries of type ``"page"`` are
        added as pages. Other entry types are ignored. Raises
        ``CommandError`` if no URL was given or feedparser is missing.
        """
        url = options.get("url")
        if url is None:
            raise CommandError("Usage is import_wordpress %s" % self.args)
        try:
            import feedparser
        except ImportError:
            raise CommandError("Could not import the feedparser library.")
        feed = feedparser.parse(url)

        # We use the minidom parser as well because feedparser won't
        # interpret WXR comments correctly and ends up munging them.
        # xml.dom.minidom is used simply to pull the comments when we
        # get to them.
        xml = parse(url)
        xmlitems = xml.getElementsByTagName("item")

        for (i, entry) in enumerate(feed["entries"]):
            print(i)
            # Get a pointer to the right position in the minidom as well.
            xmlitem = xmlitems[i]
            content = linebreaks(self.wp_caption(entry.content[0]["value"]))

            # Get the time struct of the published date if possible and
            # the updated date if we can't.
            # NOTE(review): pub_date may now be None when conversion
            # fails - confirm add_post accepts a missing date.
            pub_date = self._entry_pub_date(entry)

            # Tags and categories are all under "tags" marked with a scheme.
            # NOTE(review): the loop body was lost in the pasted source;
            # grouping terms by their scheme is reconstructed - confirm.
            terms = defaultdict(set)
            for item in getattr(entry, "tags", []):
                terms[item.scheme].add(item.term)

            post_type = entry["wp_post_type"]
            if post_type == "blog":
                print("blog post")
                # NOTE(review): the categories/old_url arguments were
                # truncated in the pasted source - confirm against add_post.
                post = self.add_post(title=entry.title, content=content,
                                     pub_date=pub_date, tags=terms["tag"],
                                     categories=terms["category"],
                                     old_url=entry.id)

                # Get the comments from the xml doc.
                for c in xmlitem.getElementsByTagName("wp:comment"):
                    name = self.get_text(c, "author", c.CDATA_SECTION_NODE)
                    email = self.get_text(c, "author_email", c.TEXT_NODE)
                    website = self.get_text(c, "author_url", c.TEXT_NODE)
                    body = self.get_text(c, "content", c.CDATA_SECTION_NODE)
                    date_text = self.get_text(c, "date_gmt", c.TEXT_NODE)
                    fmt = "%Y-%m-%d %H:%M:%S"
                    comment_date = datetime.strptime(date_text, fmt)
                    comment_date -= timedelta(seconds=timezone)
                    # NOTE(review): the pub_date argument was truncated in
                    # the pasted source - confirm against add_comment.
                    self.add_comment(post=post, name=name, email=email,
                                     body=body, website=website,
                                     pub_date=comment_date)

            elif post_type == "page":
                old_id = getattr(entry, "wp_post_id")
                parent_id = getattr(entry, "wp_post_parent")
                # NOTE(review): the old_parent_id argument was truncated
                # in the pasted source - confirm against add_page.
                self.add_page(title=entry.title, content=content,
                              tags=terms["tag"], old_id=old_id,
                              old_parent_id=parent_id)

    def wp_caption(self, post):
        """
        Filters a Wordpress Post for Image Captions and renders to
        match HTML.

        Each ``[caption key="value" ...]inner[/caption]`` shortcode in
        ``post`` is replaced by a ``<div>`` wrapping ``inner`` followed by
        a ``<p class="wp-caption-text">`` holding the caption text. The
        ``id``, ``align`` and ``width`` attributes are carried over to the
        div (the width is increased by 10). Returns the rewritten string.
        """
        for match in re.finditer(r"\[caption (.*?)\](.*?)\[/caption\]", post):
            meta = '<div '
            caption = ''
            # group(1) of the outer match is the shortcode attribute list.
            for imatch in re.finditer(r'(\w+)="(.*?)"', match.group(1)):
                key, value = imatch.group(1), imatch.group(2)
                if key == 'id':
                    meta += 'id="%s" ' % value
                if key == 'align':
                    meta += 'class="wp-caption %s" ' % value
                if key == 'width':
                    width = int(value) + 10
                    meta += 'style="width: %spx;" ' % width
                if key == 'caption':
                    caption = value
            parts = (match.group(2), caption)
            meta += '>%s<p class="wp-caption-text">%s</p></div>' % parts
            post = post.replace(match.group(0), meta)
        return post
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment