Skip to content

Instantly share code, notes, and snippets.

@svenni
Created March 3, 2014 00:02
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save svenni/9315949 to your computer and use it in GitHub Desktop.
Dumping WordPress databases exported as XML into Tinkerer
import xml.etree.ElementTree as ET
import subprocess
import unicodedata
import os
# Example file names for the three table exports this script consumes; the
# real paths are supplied on the command line (--posts, --termrel, --terms).
#posts = 'wp_posts.xml'
#termrel = 'wp_term_relationships.xml'
#terms = 'wp_terms.xml'
# Directory into which export() writes one <post name>.rst file per post.
destination_folder = '/Users/sjt/dev/python/tinkertest/input'
def html_to_rst(text):
    '''
    Convert an HTML fragment to reStructuredText by piping it through pandoc.

    This function is blatantly lifted from John Paulett's blog:
    http://johnpaulett.com/2009/10/15/html-to-restructured-text-in-python-using-pandoc/

    text -- the HTML to convert; None is treated as an empty document.

    Returns pandoc's standard output as a byte string ('' for None input).
    The input is NFKC-normalised and then encoded to ASCII with errors
    ignored, so characters without an ASCII form are dropped before pandoc
    sees them. The ``pandoc`` executable must be on PATH.
    '''
    if text is None:
        # ElementTree reports the text of an empty element as None; without
        # this guard unicode(None) would produce the literal string 'None'.
        return ''
    ascii_html = unicodedata.normalize('NFKC', unicode(text)).encode('ascii', 'ignore')
    p = subprocess.Popen(['pandoc', '--from=html', '--to=rst'],
                         stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    # communicate() feeds stdin, closes it and collects stdout in one step.
    return p.communicate(ascii_html)[0]
def export(post_path, terms_path, termrel_path):
    '''
    Convert exported WordPress tables into tinkerer posts.

    post_path    -- path to the XML export of the posts table
    terms_path   -- path to the XML export of the terms table
    termrel_path -- path to the XML export of the term relationships table

    Posts are processed in ascending order of their date string. Each one is
    written to <destination_folder>/<post name>.rst (UTF-8 encoded when the
    text is unicode) and then handed to
    ``tinker --post <file> --date <YYYY/MM/DD>``.
    '''
    terms_xml = ET.parse(terms_path)
    termrel_xml = ET.parse(termrel_path)
    posts_xml = ET.parse(post_path)
    term_lut = create_term_lut(terms_xml, termrel_xml)
    posts_data = parse_posts(posts_xml, term_lut)
    posts_data.sort(key=lambda x: x['date'])
    # tinker's output is never read, so send it to the null device: an unread
    # stdout=PIPE can block the child once the pipe buffer fills up.
    with open(os.devnull, 'w') as devnull:
        # create the files
        for post in posts_data:
            post_filename = os.path.join(destination_folder, post['name'] + '.rst')
            print(post_filename)
            post_file_contents = post['title'] + '\n'
            # reST section underline, as long as the title itself
            post_file_contents += '=' * len(post['title']) + '\n'
            post_file_contents += '\n.. author:: default\n'
            post_file_contents += '.. categories:: ' + ','.join(post['categories']) + '\n'
            post_file_contents += '.. tags:: wp\n'
            post_file_contents += '\n' + post['content']
            if not isinstance(post_file_contents, bytes):
                # Titles or categories with non-ASCII characters make the
                # whole string unicode; encode explicitly so write() does not
                # fall back to the ASCII codec and raise.
                post_file_contents = post_file_contents.encode('utf-8')
            # 'YYYY-MM-DD hh:mm:ss' -> 'YYYY/MM/DD'
            post_date = post['date'].split(' ')[0]
            slash_date = post_date.replace('-', '/')
            with open(post_filename, 'wt') as f:
                f.write(post_file_contents)
            status = subprocess.call(['tinker', '--post', post_filename, '--date', slash_date],
                                     stdin=subprocess.PIPE, stdout=devnull)
            if status != 0:
                print('tinker exited with status %s for %s' % (status, post_filename))
def create_term_lut(terms_xml, termrel_xml):
    '''
    Build a lookup table from post id to the names of the terms attached to it.

    terms_xml   -- parsed export of the terms table; rows carry the columns
                   ``term_id`` and ``name``
    termrel_xml -- parsed export of the term relationships table; rows carry
                   the columns ``object_id`` and ``term_taxonomy_id``

    Both trees must have a <database> element under the root whose children
    are the rows, each holding <column name="..."> cells. Rows that lack a
    needed column are skipped.

    Returns a dict mapping the object_id text to a list of term names. An
    object whose relationships only reference unknown terms maps to [].
    '''
    terms_dict = {}
    terms_db = terms_xml.getroot().find('database')
    for row in terms_db:
        # "is None" on purpose: an Element without children is falsy.
        id_cell = row.find(".//column[@name='term_id']")
        name_cell = row.find(".//column[@name='name']")
        if id_cell is None or name_cell is None:
            continue
        terms_dict[id_cell.text] = name_cell.text

    term_lut = {}
    termrel_db = termrel_xml.getroot().find('database')
    for row in termrel_db:
        object_cell = row.find(".//column[@name='object_id']")
        if object_cell is None:
            continue
        # Register the object even if the term lookup below fails.
        categories = term_lut.setdefault(object_cell.text, [])
        taxonomy_cell = row.find(".//column[@name='term_taxonomy_id']")
        term_key = None if taxonomy_cell is None else taxonomy_cell.text
        # NOTE(review): this looks up a term_taxonomy_id in a dict keyed by
        # term_id; presumably the two ids coincide in the exported data --
        # confirm against the term taxonomy table.
        try:
            category = terms_dict[term_key]
        except KeyError as err:
            print('got exception: %s' % err)
            continue
        categories.append(category)
    return term_lut
def parse_posts(posts_xml, term_lut):
    '''
    Extract the blog posts from a parsed export of the posts table.

    posts_xml -- parsed export; the root must contain a <database> element
                 whose children are rows of <column name="..."> cells
    term_lut  -- dict mapping post ID text to a list of category names, as
                 built by create_term_lut()

    Rows whose post_type is not 'post' (or is missing) and rows titled
    'Auto Draft' are skipped.

    Returns a list of dicts with the keys 'title', 'date', 'content' (the
    post body converted to reStructuredText), 'categories' and 'name'.
    '''
    def column_text(row, name):
        # Text of the row's <column name="..."> cell, or None when the cell
        # is absent or empty.
        cell = row.find(".//column[@name='%s']" % name)
        return None if cell is None else cell.text

    post_data = []
    posts_db = posts_xml.getroot().find('database')
    for post_elem in posts_db:
        if column_text(post_elem, 'post_type') != 'post':
            print('post is not post')
            continue
        title = column_text(post_elem, 'post_title')
        if title == 'Auto Draft':
            print('skipping autodrafts')
            continue
        # An empty post_content cell has text None; convert '' instead so
        # the body never turns into the literal string 'None'.
        raw_content = column_text(post_elem, 'post_content') or ''
        post = {
            'title': title,
            'date': column_text(post_elem, 'post_date'),
            'content': html_to_rst(raw_content),
            'categories': term_lut.get(column_text(post_elem, 'ID'), []),
            'name': column_text(post_elem, 'post_name'),
        }
        print('%s : %s' % (post['title'], post['name']))
        post_data.append(post)
    return post_data
if __name__ == '__main__':
    # Command-line entry point: collect the three export paths and run the
    # conversion. All three are required because export() parses each file
    # unconditionally; a missing one would otherwise reach ET.parse as None.
    import argparse
    parser = argparse.ArgumentParser(description='Extract data from wp database xml exports')
    parser.add_argument('--termrel', dest='term_relationship', required=True,
                        help='XML export of the term relationships table')
    parser.add_argument('--terms', dest='terms', required=True,
                        help='XML export of the terms table')
    parser.add_argument('--posts', dest='posts', required=True,
                        help='XML export of the posts table')
    args = parser.parse_args()
    export(args.posts, args.terms, args.term_relationship)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment