Skip to content

Instantly share code, notes, and snippets.

@chrono-meter
Created December 28, 2018 05:32
Show Gist options
  • Save chrono-meter/8c9b308b9af275ae1537229386844f98 to your computer and use it in GitHub Desktop.
Save chrono-meter/8c9b308b9af275ae1537229386844f98 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
from xml.etree import ElementTree
from pathlib import Path
import io
import datetime
import email.utils
# Module metadata: author contact, version, license and canonical URL of this script.
__author__ = 'chrono-meter@gmx.net'
__version__ = '1.0.0'
__license__ = 'Python Software Foundation License'
__url__ = 'https://gist.github.com/chrono-meter/8c9b308b9af275ae1537229386844f98'
def parse_items(wxr):
    """Parse a WordPress eXtended RSS (WXR) export and yield one dict per item.

    Each yielded dict contains:

    * ``'taxonomy'``: maps every ``<category>`` ``domain`` attribute to a list
      of ``{'slug': ..., 'title': ...}`` dicts.
    * ``'postmeta'``: maps every ``<wp:meta_key>`` text to the matching
      ``<wp:meta_value>`` text.
    * one entry per remaining child element, keyed by the tag name without
      its namespace.  ``content:encoded`` and ``excerpt:encoded`` are stored
      as ``'content'`` and ``'excerpt'``; ``pubDate``, ``post_date`` and
      ``post_date_gmt`` are converted to naive :class:`datetime.datetime`
      objects; every other value is the element text (``''`` when empty).

    This is a generator, so errors are raised while iterating, not when the
    function is called.

    :param wxr: os.PathLike or file-like or xml string (str or bytes) or
        ElementTree / Element object.
    :rtype: Iterator[:class:`dict`]
    :raises TypeError: if ``wxr`` is none of the supported kinds of object.
    :raises ValueError: if a ``pubDate`` is not expressed in ``+0000`` or can
        not be parsed, if a post date does not match ``%Y-%m-%d %H:%M:%S``,
        or if an item holds the same plain field twice (unsupported).
    """
    # Function-scope import so the block does not depend on a file-level
    # "import os" being present.
    import os

    wp_ns = '{http://wordpress.org/export/1.2/}'
    content_tag = '{http://purl.org/rss/1.0/modules/content/}encoded'
    excerpt_tag = '{http://wordpress.org/export/1.2/excerpt/}encoded'
    post_date_tag = wp_ns + 'post_date'
    post_date_gmt_tag = wp_ns + 'post_date_gmt'
    postmeta_tag = wp_ns + 'postmeta'

    if isinstance(wxr, (os.PathLike, io.IOBase)):
        root = ElementTree.parse(wxr).getroot()
    elif isinstance(wxr, (str, bytes)):
        # fromstring() already returns the root Element (not an ElementTree),
        # so there is no getroot() to call on its result.
        root = ElementTree.fromstring(wxr)
    elif isinstance(wxr, ElementTree.ElementTree):
        root = wxr.getroot()
    elif isinstance(wxr, ElementTree.Element):
        root = wxr
    else:
        raise TypeError('Unsupported object', wxr)

    for item in root.iterfind('channel/item'):
        data = {'taxonomy': {}, 'postmeta': {}}

        for node in item:
            tag = node.tag

            if tag == 'category':
                data['taxonomy'].setdefault(node.attrib['domain'], []).append({
                    'slug': node.attrib['nicename'],
                    'title': node.text,
                })
                continue

            if tag == postmeta_tag:
                meta_key = node.find(wp_ns + 'meta_key').text
                data['postmeta'][meta_key] = node.find(wp_ns + 'meta_value').text
                continue

            # Strip the "{namespace}" prefix, if any, to get the plain name.
            key = tag.rpartition('}')[2]
            value = node.text or ''

            if tag == content_tag:
                key = 'content'
            elif tag == excerpt_tag:
                key = 'excerpt'
            elif tag == 'pubDate':
                # <pubDate>Tue, 12 Apr 2016 06:12:22 +0000</pubDate>
                # The offset is discarded below, so only UTC values are safe.
                if not value.endswith(' +0000'):
                    raise ValueError(
                        'pubDate is not expressed in +0000: {!r}'.format(value))
                parsed = email.utils.parsedate(value)
                if parsed is None:
                    raise ValueError(
                        'pubDate can not be parsed: {!r}'.format(value))
                value = datetime.datetime(*parsed[:6])
            elif tag in (post_date_tag, post_date_gmt_tag):
                # <wp:post_date>2016-04-12 15:12:22</wp:post_date>
                # <wp:post_date_gmt>2016-04-12 06:12:22</wp:post_date_gmt>
                if tag == post_date_gmt_tag and value == '0000-00-00 00:00:00':
                    # All-zero GMT date: substitute the epoch placeholder.
                    # NOTE(review): fromtimestamp(0) is the epoch in *local*
                    # time; kept as-is because callers may compare against it.
                    value = datetime.datetime.fromtimestamp(0)
                else:
                    value = datetime.datetime.strptime(value, "%Y-%m-%d %H:%M:%S")

            # Repeated plain fields would silently overwrite each other, so
            # they are rejected explicitly (a bare assert vanishes under -O).
            if key in data:
                raise ValueError(
                    'unsupported feature: duplicate item field {!r}'.format(key))
            data[key] = value

        yield data
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment