Created
April 1, 2019 21:29
-
-
Save dashengz/5bc0fcd0f2408116fe2f4df93d7986ad to your computer and use it in GitHub Desktop.
Convert exported WordPress RSS data to JSON
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import xml.etree.ElementTree as ET | |
import json | |
# configuration: adjust the input/output paths and the namespace table if needed
xml_name = './data.xml'
out_name = 'data.json'

# short prefix -> namespace URI, for the namespaces expected in the export
ns = {
    'wp': 'http://wordpress.org/export/1.2/',
    'excerpt': 'http://wordpress.org/export/1.2/excerpt/',
    'content': 'http://purl.org/rss/1.0/modules/content/',
    'wfw': 'http://wellformedweb.org/CommentAPI/',
    'dc': 'http://purl.org/dc/elements/1.1/',
}

# '{namespace-uri}' -> 'prefix_' lookup used to rewrite qualified tag names
ns_map = {'{' + uri + '}': prefix + '_' for prefix, uri in ns.items()}

# load the export and locate its <channel> element
tree = ET.parse(xml_name)
root = tree.getroot()
channel = root.find('channel')

# accumulator that receives the converted site data
site = {}
def format_ns(tag):
    """
    Rewrite a namespace-qualified tag into its short ``prefix_name`` form.

    The first entry of the module-level ``ns_map`` whose ``{uri}`` key occurs
    in ``tag`` is substituted with its ``prefix_`` value; a tag matching no
    entry is returned unchanged.

    :param tag: element tag, possibly containing a ``{namespace-uri}`` part
    :return: the tag with the matching ``{namespace-uri}`` replaced, or the
        original tag when nothing matches
    """
    for qualified, short in ns_map.items():
        if qualified in tag:
            return tag.replace(qualified, short)
    return tag
# Keys whose values are always collected into a list, because the matching
# element may occur several times under the same parent. Built once at import
# time instead of on every call.
_LIST_TYPE_KEYS = frozenset({
    'wp_author',
    'item',
    'category',
    'wp_postmeta',
    'wp_comment',
    'wp_commentmeta',
    # Taxonomy entries repeat in the export just like wp_author does; without
    # these, every occurrence except the last one would be overwritten.
    'wp_category',
    'wp_tag',
    'wp_term',
    'wp_termmeta',
})


def add_data(key, value, data):
    """
    Store ``value`` under ``key`` in ``data``.

    For keys listed in ``_LIST_TYPE_KEYS`` the value is appended to a list
    stored under ``key`` (the list is created on first use, so such keys hold
    a list even when only one value was added). For every other key the value
    is assigned directly, replacing any value already present.

    :param key: key to store the value under
    :param value: value to store
    :param data: dict that is modified in place
    :return: None
    """
    if key in _LIST_TYPE_KEYS:
        data.setdefault(key, []).append(value)
    else:
        data[key] = value
def iterate(element, data):
    """
    Convert ``element`` and its descendants and record the result in ``data``.

    An element with neither children nor attributes is stored as its text
    under its formatted tag. Any other element becomes a dict holding its
    attributes (plus its text under ``'value'`` when it has attributes) and
    its recursively converted children; that dict is stored under the
    formatted tag.

    :param element: XML element to convert
    :param data: dict receiving the converted element, modified in place
    :return: None
    """
    key = format_ns(element.tag)
    attributes = element.items()
    child_elements = list(element)

    # leaf without attributes: the text itself is the value
    if not child_elements and not attributes:
        add_data(key, element.text, data)
        return

    node = {}
    if attributes:
        for attr_name, attr_value in attributes:
            add_data(attr_name, attr_value, node)
        # keep the element text next to its attributes
        add_data('value', element.text, node)
    for child in child_elements:
        iterate(child, node)
    add_data(key, node, data)
# iterate through channel to get site data
iterate(channel, site)

# Serialize before opening the output so that a serialization error does not
# leave an existing output file truncated.
payload = json.dumps(site)

# write json to file; the context manager closes the handle even if the
# write fails
with open(out_name, 'w', encoding='utf-8') as f:
    f.write(payload)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment