Skip to content

Instantly share code, notes, and snippets.

Created Mar 3, 2014
What would you like to do?
# Dump WordPress databases, exported as XML files, into Tinkerer.
import xml.etree.ElementTree as ET
import subprocess
import unicodedata
import os
#posts = 'wp_posts.xml'
#termrel = 'wp_term_relationships.xml'
#terms = 'wp_terms.xml'
# Directory into which export() writes one "<post_name>.rst" file per post.
# NOTE(review): hard-coded, machine-specific path -- presumably the root of
# the Tinkerer blog; confirm and adjust before running elsewhere.
destination_folder = '/Users/sjt/dev/python/tinkertest/input'
def html_to_rst(text):
    """Convert an HTML fragment to reStructuredText by piping it through pandoc.

    Adapted from a snippet on John Paulett's blog.

    The input is NFKC-normalized and then reduced to ASCII (characters that
    have no ASCII form after normalization are dropped) before it is sent to
    pandoc's standard input.

    Args:
        text: HTML markup as text, or as UTF-8 encoded bytes. ``None`` or an
            empty value means "no content".

    Returns:
        pandoc's raw standard output as a byte string, or an empty byte
        string when there is nothing to convert.

    Raises:
        OSError: if the ``pandoc`` executable cannot be started.
    """
    if not text:
        # A column without a body yields None; stringifying that would put
        # the literal word "None" into the post, so short-circuit instead.
        return b''
    if isinstance(text, bytes):
        # unicodedata.normalize() needs text, not bytes (on Python 2 a plain
        # ``str`` is bytes).
        text = text.decode('utf-8')
    ascii_html = unicodedata.normalize('NFKC', text).encode('ascii', 'ignore')
    pandoc = subprocess.Popen(['pandoc', '--from=html', '--to=rst'],
                              stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    # communicate() feeds stdin, closes it and drains stdout, so the child
    # cannot block on a full pipe.
    return pandoc.communicate(ascii_html)[0]
def render_post(post):
    """Build the reStructuredText source of one post as text.

    Args:
        post: dict with the keys ``'title'`` (text), ``'categories'`` (list
            of text) and ``'content'`` (text, or UTF-8 bytes such as raw
            pandoc output).

    Returns:
        The title, an ``=`` underline of the same length, the author /
        categories / tags directives, a blank line and the post body, joined
        with newlines.
    """
    title = post['title']
    body = post['content']
    if isinstance(body, bytes):
        # Converter output arrives as bytes; the file is written as text.
        body = body.decode('utf-8')
    return '\n'.join([
        title,
        '=' * len(title),
        '',
        '.. author:: default',
        '.. categories:: ' + ','.join(post['categories']),
        '.. tags:: wp',
        '',
        body,
    ])


def export(post_path, terms_path, termrel_path):
    """Turn three WordPress XML table exports into Tinkerer posts.

    Parses the exports, builds the post list, sorts it by post date, writes
    each post to ``<destination_folder>/<post name>.rst`` and then registers
    that file with ``tinker --post <file> --date YYYY/MM/DD``.

    Args:
        post_path: path of the posts table XML export.
        terms_path: path of the terms table XML export.
        termrel_path: path of the term-relationships table XML export.

    Returns:
        None. The results are the files written and the ``tinker`` runs.
    """
    import io  # function-scope import: io.open takes an encoding on Python 2 and 3

    terms_xml = ET.parse(terms_path)
    termrel_xml = ET.parse(termrel_path)
    posts_xml = ET.parse(post_path)
    term_lut = create_term_lut(terms_xml, termrel_xml)
    posts_data = parse_posts(posts_xml, term_lut)
    posts_data.sort(key=lambda x: x['date'])

    # create the files
    for post in posts_data:
        post_filename = os.path.join(destination_folder, post['name'] + '.rst')
        print(post_filename)

        # The context manager guarantees the file is flushed and closed
        # before tinker is asked to pick it up.
        with io.open(post_filename, 'w', encoding='utf-8') as post_file:
            post_file.write(render_post(post))

        # 'YYYY-MM-DD HH:MM:SS' -> 'YYYY/MM/DD'
        slash_date = post['date'].split(' ')[0].replace('-', '/')

        # Blocking call: posts are handed over one at a time, in the date
        # order established by the sort above. The child's output is not
        # piped, so it cannot stall on an unread pipe.
        status = subprocess.call(
            ['tinker', '--post', post_filename, '--date', slash_date])
        if status != 0:
            print('tinker exited with status %d for %s'
                  % (status, post_filename))
def create_term_lut(terms_xml, termrel_xml):
    """Map each post id to the list of term names attached to it.

    Both arguments are parsed XML trees whose root has a ``database`` child;
    each child of ``database`` is treated as one row whose values live in
    ``<column name="...">`` elements.

    Args:
        terms_xml: tree of the terms table (columns ``term_id``, ``name``).
        termrel_xml: tree of the term-relationships table (columns
            ``object_id``, ``term_taxonomy_id``).

    Returns:
        dict mapping ``object_id`` text to a list of term names, in row
        order. A post whose relationship rows could not be resolved still
        gets an entry (possibly an empty list).
    """
    def column_text(row, name):
        # find() returns None for a missing column, so .text then raises
        # AttributeError; callers below decide how to treat that.
        return row.find(".//column[@name='%s']" % name).text

    # term id -> term name
    terms_dict = {}
    for row in terms_xml.getroot().find('database'):
        terms_dict[column_text(row, 'term_id')] = column_text(row, 'name')

    term_lut = {}
    for row in termrel_xml.getroot().find('database'):
        try:
            post_id = column_text(row, 'object_id')
            categories = term_lut.setdefault(post_id, [])
            # NOTE(review): the relationship row's term_taxonomy_id is looked
            # up directly among term_id values -- confirm the two ids
            # coincide in the exported database.
            categories.append(terms_dict[column_text(row, 'term_taxonomy_id')])
        except (KeyError, AttributeError) as err:
            # Best effort: report the unresolvable row and keep going.
            print('got exception: %s' % (err,))
    return term_lut
def parse_posts(posts_xml, term_lut):
    """Extract the real blog posts from a posts-table XML export.

    Each child of the root's ``database`` element is treated as one row
    whose values live in ``<column name="...">`` elements. Rows whose
    ``post_type`` is not ``'post'`` and rows titled ``'Auto Draft'`` are
    skipped.

    Args:
        posts_xml: parsed XML tree of the posts table.
        term_lut: dict mapping a post ``ID`` (text) to its category names.

    Returns:
        list of dicts with the keys ``'title'``, ``'date'``, ``'content'``
        (the post body converted by ``html_to_rst``), ``'categories'`` and
        ``'name'``, in row order.
    """
    def column_text(row, name):
        return row.find(".//column[@name='%s']" % name).text

    post_data = []
    for post_elem in posts_xml.getroot().find('database'):
        if column_text(post_elem, 'post_type') != 'post':
            print('post is not post')
            continue

        title = column_text(post_elem, 'post_title')
        if title == 'Auto Draft':
            print('skipping autodrafts')
            continue

        post = {
            'title': title,
            'date': column_text(post_elem, 'post_date'),
            'content': html_to_rst(column_text(post_elem, 'post_content')),
            'categories': term_lut.get(column_text(post_elem, 'ID'), []),
            'name': column_text(post_elem, 'post_name'),
        }
        print('%s : %s' % (post['title'], post['name']))
        post_data.append(post)
    return post_data
def build_arg_parser():
    """Create the command-line parser for the three XML export paths.

    All three options are required, so a missing path is reported as a
    usage error instead of reaching the XML parser as ``None``.

    Returns:
        An ``argparse.ArgumentParser`` whose parsed namespace has the
        attributes ``term_relationship``, ``terms`` and ``posts``.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Extract data from wp database xml exports')
    parser.add_argument('--termrel', dest='term_relationship', required=True,
                        help='XML export of the term relationships table')
    parser.add_argument('--terms', dest='terms', required=True,
                        help='XML export of the terms table')
    parser.add_argument('--posts', dest='posts', required=True,
                        help='XML export of the posts table')
    return parser


if __name__ == '__main__':
    args = build_arg_parser().parse_args()
    export(args.posts, args.terms, args.term_relationship)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment