Created
March 3, 2014 00:02
-
-
Save svenni/9315949 to your computer and use it in GitHub Desktop.
Dumping WordPress database tables exported as XML into Tinkerer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse
import os
import subprocess
import unicodedata
import xml.etree.ElementTree as ET
# Leftover example export file names; the paths actually used are supplied
# on the command line (--posts / --termrel / --terms).
#posts = 'wp_posts.xml'
#termrel = 'wp_term_relationships.xml'
#terms = 'wp_terms.xml'

# Directory into which export() writes one '<post_name>.rst' file per post.
destination_folder = '/Users/sjt/dev/python/tinkertest/input'
def html_to_rst(text):
    '''
    Convert an HTML fragment to reStructuredText by piping it through pandoc.

    This function is blatantly lifted from John Paulett's blog:
    http://johnpaulett.com/2009/10/15/html-to-restructured-text-in-python-using-pandoc/

    :param text: HTML markup, or None / empty when there is nothing to convert.
    :returns: pandoc's standard output as a byte string; an empty byte string
        when ``text`` is None or empty (pandoc is not started in that case).
    '''
    # An empty XML element has ``.text is None``; unicode(None) would produce
    # the literal string u'None', which would end up as the post body.
    if not text:
        return b''

    # Normalise first, then encode to ASCII dropping whatever cannot be
    # represented, so pandoc only ever receives plain ASCII bytes.
    ascii_html = unicodedata.normalize('NFKC', unicode(text)).encode('ascii', 'ignore')

    pandoc = subprocess.Popen(['pandoc', '--from=html', '--to=rst'],
                              stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    # communicate() returns (stdout, stderr); only stdout is piped here.
    return pandoc.communicate(ascii_html)[0]
def export(post_path, terms_path, termrel_path):
    '''
    Write every WordPress post to an .rst file and register it with tinker.

    The three XML files are parsed, posts are sorted by their 'date' string,
    and for each post a '<name>.rst' file is written into the module-level
    ``destination_folder`` before ``tinker --post <file> --date <Y/M/D>`` is
    run on it.

    :param post_path: path of the posts table XML export.
    :param terms_path: path of the terms table XML export.
    :param termrel_path: path of the term-relationships table XML export.
    '''
    terms_xml = ET.parse(terms_path)
    termrel_xml = ET.parse(termrel_path)
    posts_xml = ET.parse(post_path)

    term_lut = create_term_lut(terms_xml, termrel_xml)
    posts_data = parse_posts(posts_xml, term_lut)
    posts_data.sort(key=lambda x: x['date'])

    # tinker's output is discarded.  It is sent to the null device rather
    # than to a pipe: subprocess.call() never reads from a pipe, so a child
    # that prints enough to fill the pipe buffer would block forever.
    with open(os.devnull, 'w') as devnull:
        for post in posts_data:
            post_filename = os.path.join(destination_folder, post['name'] + '.rst')
            print(post_filename)

            # Title with an '=' underline, then the directives, then the body.
            post_file_contents = post['title'] + '\n'
            post_file_contents += '=' * len(post['title']) + '\n'
            post_file_contents += '\n.. author:: default\n'
            post_file_contents += '.. categories:: ' + ','.join(post['categories']) + '\n'
            post_file_contents += '.. tags:: wp\n'
            post_file_contents += '\n' + post['content']

            # Keep only the part before the first space of the date string
            # and turn 'Y-M-D' into 'Y/M/D' for tinker's --date option.
            post_date = post['date'].split(' ')[0]
            pparts = post_date.split('-')
            slash_date = '%s/%s/%s' % (pparts[0], pparts[1], pparts[2])

            # The context manager closes the file even if the write fails.
            with open(post_filename, 'wt') as f:
                f.write(post_file_contents)

            subprocess.call(['tinker', '--post', post_filename, '--date', slash_date],
                            stdin=subprocess.PIPE, stdout=devnull)
def create_term_lut(terms_xml, termrel_xml):
    '''
    Build a lookup table from post id to the list of term names attached to it.

    :param terms_xml: parsed terms export; each child of its <database>
        element is read for the 'term_id' and 'name' columns.
    :param termrel_xml: parsed term-relationships export; each child of its
        <database> element is read for 'object_id' and 'term_taxonomy_id'.
    :returns: dict mapping the 'object_id' text to a list of term names.  An
        object whose relationships all fail to resolve still gets an empty
        list.
    '''
    # term_id text -> term name
    terms_dict = {}
    terms_db = terms_xml.getroot().find('database')
    for term_row in terms_db:
        id_column = term_row.find(".//column[@name='term_id']")
        name_column = term_row.find(".//column[@name='name']")
        # Rows that lack either column cannot contribute a mapping.
        if id_column is None or name_column is None:
            continue
        terms_dict[id_column.text] = name_column.text

    term_lut = {}
    termrel_db = termrel_xml.getroot().find('database')
    for rel_row in termrel_db:
        post_id = rel_row.find(".//column[@name='object_id']").text
        # Create the entry up front so the object is present in the result
        # even when the lookup below fails.
        categories = term_lut.setdefault(post_id, [])
        try:
            # NOTE(review): the term_taxonomy_id value is looked up directly
            # as a term_id -- confirm the two ids coincide in the exported data.
            taxonomy_id = rel_row.find(".//column[@name='term_taxonomy_id']").text
            category = terms_dict[taxonomy_id]
        except (KeyError, AttributeError) as err:
            # Unknown term (KeyError) or missing column (AttributeError):
            # report it and carry on with the remaining relationships.
            print('got exception: %s' % err)
            continue
        categories.append(category)
    return term_lut
def _column_text(row, name):
    '''Return the text of the <column name="..."> descendant of ``row``.'''
    return row.find(".//column[@name='%s']" % name).text


def parse_posts(posts_xml, term_lut):
    '''
    Extract the blog posts from a parsed posts-table export.

    Rows whose 'post_type' is not 'post', and rows titled 'Auto Draft', are
    skipped with a message on stdout.

    :param posts_xml: parsed posts export; each child of its <database>
        element is treated as one row.
    :param term_lut: dict mapping a post's 'ID' text to a list of category
        names; posts absent from it get an empty list.
    :returns: list of dicts with the keys 'title', 'date', 'content' (the
        post body converted by html_to_rst), 'categories' and 'name'.
    '''
    post_data = []
    posts_db = posts_xml.getroot().find('database')
    for post_elem in posts_db:
        if _column_text(post_elem, 'post_type') != 'post':
            print('post is not post')
            continue

        title = _column_text(post_elem, 'post_title')
        if title == 'Auto Draft':
            print('skipping autodrafts')
            continue

        post = {}
        post['title'] = title
        post['date'] = _column_text(post_elem, 'post_date')
        # An empty <column> has text None; hand the converter an empty string
        # instead so the literal word 'None' never becomes the post body.
        post['content'] = html_to_rst(_column_text(post_elem, 'post_content') or '')
        post['categories'] = term_lut.get(_column_text(post_elem, 'ID'), [])
        post['name'] = _column_text(post_elem, 'post_name')

        print('%s : %s' % (post['title'], post['name']))
        post_data.append(post)
    return post_data
def main(argv=None):
    '''
    Parse the command line and run the export.

    :param argv: argument list to parse; None means use sys.argv[1:].
    '''
    parser = argparse.ArgumentParser(description='Extract data from wp database xml exports')
    # All three exports are needed by export(); making them required yields a
    # usage error instead of a failure inside the XML parser on a None path.
    parser.add_argument('--termrel', dest='term_relationship', required=True,
                        help='XML export of the term relationships table')
    parser.add_argument('--terms', dest='terms', required=True,
                        help='XML export of the terms table')
    parser.add_argument('--posts', dest='posts', required=True,
                        help='XML export of the posts table')
    args = parser.parse_args(argv)
    export(args.posts, args.terms, args.term_relationship)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment