Created
December 28, 2018 05:32
-
-
Save chrono-meter/8c9b308b9af275ae1537229386844f98 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import datetime
import email.utils
import io
import os
from pathlib import Path
from xml.etree import ElementTree
# Module metadata: author contact address, release version, license name and
# the URL of the gist this module is published at.
__author__ = 'chrono-meter@gmx.net'
__version__ = '1.0.0'
__license__ = 'Python Software Foundation License'
__url__ = 'https://gist.github.com/chrono-meter/8c9b308b9af275ae1537229386844f98'
def _wxr_root(wxr):
    """Return the root element for any input accepted by :func:`parse_items`.

    :param wxr: os.PathLike, file-like, xml string (str or bytes),
        ElementTree or Element object.
    :raises TypeError: if ``wxr`` is none of the supported kinds.
    """
    if isinstance(wxr, ElementTree.Element):
        return wxr
    if isinstance(wxr, ElementTree.ElementTree):
        return wxr.getroot()
    if isinstance(wxr, (str, bytes)):
        # fromstring() already returns the root Element (not an ElementTree),
        # so there is no getroot() to call on its result.
        return ElementTree.fromstring(wxr)
    if isinstance(wxr, (os.PathLike, io.IOBase)):
        # ElementTree.parse() accepts both a path and an open file object.
        return ElementTree.parse(wxr).getroot()
    raise TypeError('Unsupported object', wxr)


def parse_items(wxr):
    """Parse WordPress eXtended Rss file.

    Each ``channel/item`` element is turned into one dict:

    * ``'taxonomy'`` maps every ``<category>`` ``domain`` attribute to a list
      of ``{'slug': <nicename attribute>, 'title': <element text>}`` dicts.
    * ``'postmeta'`` maps each ``<wp:meta_key>`` text to the text of its
      sibling ``<wp:meta_value>``.
    * Every other child element is stored under its tag name with the XML
      namespace stripped; ``<content:encoded>`` and ``<excerpt:encoded>`` are
      stored as ``'content'`` and ``'excerpt'``.  Values are the element text
      (``''`` when empty), except ``pubDate``, ``post_date`` and
      ``post_date_gmt`` which are converted to naive
      :class:`datetime.datetime` objects.

    :param wxr: os.PathLike or file-like or xml string (str or bytes) or
        ElementTree object or root Element.
    :rtype: Iterator[:class:`dict`]
    :raises TypeError: if ``wxr`` is of an unsupported type (raised when the
        iterator is first advanced, since this is a generator).
    """
    wp_ns = '{http://wordpress.org/export/1.2/}'
    content_tag = '{http://purl.org/rss/1.0/modules/content/}encoded'
    excerpt_tag = '{http://wordpress.org/export/1.2/excerpt/}encoded'
    date_tags = (wp_ns + 'post_date', wp_ns + 'post_date_gmt')

    root = _wxr_root(wxr)

    for item in root.iterfind('channel/item'):
        data = {'taxonomy': {}, 'postmeta': {}}

        for node in item:
            if node.tag == 'category':
                data['taxonomy'].setdefault(node.attrib['domain'], []).append({
                    'slug': node.attrib['nicename'],
                    'title': node.text,
                })
                continue

            if node.tag == wp_ns + 'postmeta':
                meta_key = node.find(wp_ns + 'meta_key').text
                data['postmeta'][meta_key] = node.find(wp_ns + 'meta_value').text
                continue

            # Default key is the local tag name, i.e. the tag without its
            # '{namespace}' prefix.
            key = node.tag.split('}')[1] if node.tag.startswith('{') else node.tag
            value = node.text or ''

            if node.tag == content_tag:
                # Both <content:encoded> and <excerpt:encoded> have the local
                # name 'encoded', so they need distinct keys.
                key = 'content'
            elif node.tag == excerpt_tag:
                key = 'excerpt'
            elif node.tag == 'pubDate':
                # <pubDate>Tue, 12 Apr 2016 06:12:22 +0000</pubDate>
                # The UTC offset is dropped below, so only +0000 is accepted.
                assert value.endswith(' +0000')
                value = datetime.datetime(*email.utils.parsedate(value)[:6])
            elif node.tag in date_tags:
                # <wp:post_date>2016-04-12 15:12:22</wp:post_date>
                # <wp:post_date_gmt>2016-04-12 06:12:22</wp:post_date_gmt>
                if node.tag == wp_ns + 'post_date_gmt' and value == '0000-00-00 00:00:00':
                    # The all-zero placeholder is not a valid date for
                    # strptime(); map it to the epoch instead.
                    # NOTE(review): fromtimestamp(0) is the epoch expressed in
                    # the machine's local time zone, not GMT - confirm that
                    # this is the intended sentinel.
                    value = datetime.datetime.fromtimestamp(0)
                else:
                    value = datetime.datetime.strptime(value, "%Y-%m-%d %H:%M:%S")

            # Two children mapping to the same key would silently overwrite
            # each other; treat that as an unsupported document.
            assert key not in data, 'unsupported feature'
            data[key] = value

        yield data
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment