Skip to content

Instantly share code, notes, and snippets.

@joshcartme
Created May 14, 2015 15:50
Show Gist options
  • Save joshcartme/5f13fc525189b5d22112 to your computer and use it in GitHub Desktop.
Save joshcartme/5f13fc525189b5d22112 to your computer and use it in GitHub Desktop.
Updated Mezzanine import_wordpress.py to handle pub_date conversion error
from __future__ import unicode_literals
from future.builtins import int
from collections import defaultdict
from datetime import datetime, timedelta
from optparse import make_option
import re
from time import mktime, timezone
from xml.dom.minidom import parse
from django.core.management.base import CommandError
from django.utils.html import linebreaks
from theme.management.base import BaseImporterCommand
class Command(BaseImporterCommand):
    """
    Implements a Wordpress importer. Takes a file path or a URL for the
    Wordpress Extended RSS file.
    """

    option_list = BaseImporterCommand.option_list + (
        make_option("-u", "--url", dest="url", help="URL to import file"),
    )

    def get_text(self, xml, name, nodetype):
        """
        Gets the text value of a ``wp:comment_<name>`` element.

        ``xml`` is the minidom node to search, ``name`` is the suffix
        appended to ``wp:comment_`` to form the tag name, and
        ``nodetype`` is the DOM node type (e.g. ``TEXT_NODE`` or
        ``CDATA_SECTION_NODE``) whose data should be collected.

        Returns the concatenated data of the matching child nodes of the
        first such element, or an empty string if the element is absent.
        """
        elements = xml.getElementsByTagName("wp:comment_" + name)
        if not elements:
            # A missing child element should not abort the whole import.
            return ""
        nodes = elements[0].childNodes
        return "".join([n.data for n in nodes if n.nodeType == nodetype])

    def handle_import(self, options):
        """
        Gets the posts from either the provided URL or the path if it
        is local.

        Entries whose ``wp_post_type`` is "blog" are passed to
        ``add_post`` along with their comments (via ``add_comment``);
        entries of type "page" are passed to ``add_page``. Other entry
        types are ignored.

        Raises ``CommandError`` if no URL option was given or if the
        feedparser library cannot be imported.
        """
        url = options.get("url")
        if url is None:
            raise CommandError("Usage is import_wordpress %s" % self.args)
        try:
            import feedparser
        except ImportError:
            raise CommandError("Could not import the feedparser library.")
        feed = feedparser.parse(url)

        # We use the minidom parser as well because feedparser won't
        # interpret WXR comments correctly and ends up munging them.
        # xml.dom.minidom is used simply to pull the comments when we
        # get to them.
        # NOTE(review): minidom's parse() takes a filename or file object,
        # so a remote URL that feedparser accepts may fail here - confirm.
        xml = parse(url)
        xmlitems = xml.getElementsByTagName("item")

        for (i, entry) in enumerate(feed["entries"]):
            # Progress output; the parenthesised single-argument form
            # behaves the same under Python 2 and Python 3.
            print(i)

            # Get a pointer to the right position in the minidom as well.
            # Assumes feedparser and minidom yield the items in the same
            # order and number.
            xmlitem = xmlitems[i]
            content = linebreaks(self.wp_caption(entry.content[0]["value"]))

            # Get the time struct of the published date if possible and
            # the updated date if we can't. The fallback is only looked
            # up when needed so a missing updated date cannot raise while
            # a published date is available.
            pub_date = getattr(entry, "published_parsed", None)
            if pub_date is None:
                pub_date = getattr(entry, "updated_parsed", None)
            if pub_date is not None:
                pub_date = datetime.fromtimestamp(mktime(pub_date))
                pub_date -= timedelta(seconds=timezone)
            # Otherwise the date could not be parsed and pub_date stays
            # None rather than crashing in mktime().
            # NOTE(review): confirm add_post accepts pub_date=None.

            # Tags and categories are all under "tags" marked with a scheme.
            terms = defaultdict(set)
            for item in getattr(entry, "tags", []):
                terms[item.scheme].add(item.term)

            post_type = entry.wp_post_type
            if post_type == "blog":
                print('blog post')
                post = self.add_post(title=entry.title, content=content,
                                     pub_date=pub_date, tags=terms["tag"],
                                     categories=terms["category"],
                                     old_url=entry.id, author=entry.author)

                # Get the comments from the xml doc. Distinct local names
                # are used so the import URL and the post's date are not
                # shadowed by per-comment values.
                fmt = "%Y-%m-%d %H:%M:%S"
                for c in xmlitem.getElementsByTagName("wp:comment"):
                    name = self.get_text(c, "author", c.CDATA_SECTION_NODE)
                    email = self.get_text(c, "author_email", c.TEXT_NODE)
                    website = self.get_text(c, "author_url", c.TEXT_NODE)
                    body = self.get_text(c, "content", c.CDATA_SECTION_NODE)
                    date_text = self.get_text(c, "date_gmt", c.TEXT_NODE)
                    comment_date = datetime.strptime(date_text, fmt)
                    comment_date -= timedelta(seconds=timezone)
                    self.add_comment(post=post, name=name, email=email,
                                     body=body, website=website,
                                     pub_date=comment_date)

            elif post_type == "page":
                old_id = getattr(entry, "wp_post_id")
                parent_id = getattr(entry, "wp_post_parent")
                self.add_page(title=entry.title, content=content,
                              tags=terms["tag"], old_id=old_id,
                              old_parent_id=parent_id)

    def wp_caption(self, post):
        """
        Filters a Wordpress Post for Image Captions and renders to
        match HTML.

        Each ``[caption ...]...[/caption]`` shortcode in ``post`` is
        replaced by a ``<div>`` wrapping the shortcode's inner content
        followed by a ``<p class="wp-caption-text">`` holding the caption
        text. Returns the resulting string.
        """
        for match in re.finditer(r"\[caption (.*?)\](.*?)\[/caption\]", post):
            meta = '<div '
            caption = ''
            # Translate the shortcode's key="value" attributes into HTML
            # attributes on the wrapping div.
            for imatch in re.finditer(r'(\w+)="(.*?)"', match.group(1)):
                key = imatch.group(1)
                value = imatch.group(2)
                if key == 'id':
                    meta += 'id="%s" ' % value
                elif key == 'align':
                    meta += 'class="wp-caption %s" ' % value
                elif key == 'width':
                    # The div is made 10px wider than the stated width.
                    width = int(value) + 10
                    meta += 'style="width: %spx;" ' % width
                elif key == 'caption':
                    caption = value
            parts = (match.group(2), caption)
            meta += '>%s<p class="wp-caption-text">%s</p></div>' % parts
            post = post.replace(match.group(0), meta)
        return post
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment