svenni/gist:9315949

## gistfile1.txt
import xml.etree.ElementTree as ET
import subprocess
import unicodedata
import os

#posts = 'wp_posts.xml'
#termrel = 'wp_term_relationships.xml'
#terms = 'wp_terms.xml'

# Directory that export() joins with '<post name>.rst' to build the path of
# every generated post file.
destination_folder = '/Users/sjt/dev/python/tinkertest/input'


def html_to_rst(text):
    '''
    Convert an HTML fragment to reStructuredText by piping it through pandoc.

    This function is blatantly lifted from John Paulett's blog:
    http://johnpaulett.com/2009/10/15/html-to-restructured-text-in-python-using-pandoc/

    The markup is NFKC-normalised and every character that is still outside
    ASCII afterwards is dropped before the data is handed to pandoc.

    text -- the HTML markup.  ``None`` (what ElementTree reports for an
            empty element) yields an empty result without starting pandoc,
            instead of being converted to the literal text "None".

    Returns the byte string pandoc wrote to its standard output.
    '''
    if text is None:
        return b''

    # ``unicode`` on Python 2, ``str`` on Python 3.
    text_type = type(u'')
    if isinstance(text, bytes):
        # Same implicit ASCII decoding Python 2's unicode() performs.
        text = text.decode('ascii')
    else:
        text = text_type(text)

    ascii_html = unicodedata.normalize('NFKC', text).encode('ascii', 'ignore')
    p = subprocess.Popen(['pandoc', '--from=html', '--to=rst'],
                         stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    return p.communicate(ascii_html)[0]


def export(post_path, terms_path, termrel_path, destination=None):
    '''
    Write every post found in the XML exports as an .rst file and register
    it with the ``tinker`` command.

    The three XML files are parsed, the posts are collected with
    create_term_lut() and parse_posts() and then handled in ascending date
    order.  For each post a file "<destination>/<post name>.rst" is written
    (title, '=' underline, author/categories/tags directives, converted
    body) and ``tinker --post <file> --date <yyyy/mm/dd>`` is run on it.

    post_path    -- path of the posts XML export
    terms_path   -- path of the terms XML export
    termrel_path -- path of the term relationships XML export
    destination  -- directory the .rst files are written to; defaults to
                    the module-level ``destination_folder``

    Posts without a name or a date are reported and skipped, because
    neither a file name nor a ``--date`` argument can be built for them.
    '''
    import io  # local import: the module header does not import io

    def as_text(value):
        # None -> empty text; byte strings (e.g. pandoc output) -> text.
        if value is None:
            return u''
        if isinstance(value, bytes):
            return value.decode('utf-8', 'replace')
        return value

    if destination is None:
        destination = destination_folder

    terms_xml = ET.parse(terms_path)
    termrel_xml = ET.parse(termrel_path)
    posts_xml = ET.parse(post_path)

    term_lut = create_term_lut(terms_xml, termrel_xml)

    posts_data = parse_posts(posts_xml, term_lut)
    # A missing date sorts first; such posts are skipped below.
    posts_data.sort(key=lambda post: post['date'] or '')

    # The child gets /dev/null instead of pipes: pipes that nobody reads
    # make subprocess.call() block once the child fills the pipe buffer.
    with open(os.devnull, 'rb') as null_in, open(os.devnull, 'wb') as null_out:
        # create the files
        for post in posts_data:
            if not post['name'] or not post['date']:
                print('skipping post without name or date: %s' % (post['title'],))
                continue

            post_filename = os.path.join(destination, post['name'] + '.rst')
            print(post_filename)

            title = as_text(post['title'])
            categories = u','.join(as_text(c) for c in post['categories'])
            header_lines = [
                title,
                u'=' * len(title),
                u'',
                u'.. author:: default',
                u'.. categories:: ' + categories,
                u'.. tags:: wp',
                u'',
            ]
            post_file_contents = (u'\n'.join(header_lines) + u'\n' +
                                  as_text(post['content']))

            # 'YYYY-MM-DD hh:mm:ss' -> 'YYYY/MM/DD'
            post_date = post['date'].split(' ')[0]
            slash_date = '/'.join(post_date.split('-')[:3])

            # Explicit encoding so non-ASCII titles do not fail on write.
            with io.open(post_filename, 'w', encoding='utf-8') as post_file:
                post_file.write(post_file_contents)

            status = subprocess.call(
                ['tinker', '--post', post_filename, '--date', slash_date],
                stdin=null_in, stdout=null_out)
            if status != 0:
                print('tinker exited with status %s for %s' % (status, post_filename))


def create_term_lut(terms_xml, termrel_xml):
    '''
    Build a lookup table mapping a post id to the names of its terms.

    terms_xml   -- parsed terms export; every child of its <database>
                   element is read for the columns 'term_id' and 'name'
    termrel_xml -- parsed term relationships export; every child of its
                   <database> element is read for the columns 'object_id'
                   and 'term_taxonomy_id'

    Returns a dict {object_id text: [term name, ...]}.  An object id whose
    term cannot be resolved still gets an entry (possibly an empty list);
    the unresolved key is reported on stdout.

    NOTE(review): 'term_taxonomy_id' is looked up directly in the mapping
    keyed by 'term_id' -- confirm that the two ids coincide in the export.
    '''
    def column_text(row, column_name):
        # Text of the named <column> below ``row``; None when it is absent.
        column = row.find(".//column[@name='%s']" % column_name)
        return None if column is None else column.text

    terms_dict = {}
    for table_elem in terms_xml.getroot().find('database'):
        term_id = column_text(table_elem, 'term_id')
        if term_id is None:
            # Row without a usable term id: nothing to register.
            continue
        terms_dict[term_id] = column_text(table_elem, 'name')

    term_lut = {}
    for table_elem in termrel_xml.getroot().find('database'):
        post_id = column_text(table_elem, 'object_id')
        if post_id is None:
            continue
        categories = term_lut.setdefault(post_id, [])

        term_key = column_text(table_elem, 'term_taxonomy_id')
        try:
            categories.append(terms_dict[term_key])
        except KeyError as err:
            print('got exception: %s' % (err,))

    return term_lut


def parse_posts(posts_xml, term_lut):
    '''
    Collect the real posts from the parsed posts export.

    posts_xml -- parsed posts export; every child of its <database> element
                 is treated as one row
    term_lut  -- dict mapping the text of a row's 'ID' column to a list of
                 category names (see create_term_lut)

    Rows whose 'post_type' is not 'post' (or is missing) and rows titled
    'Auto Draft' are skipped.  Every remaining row becomes a dict with the
    keys 'title', 'date', 'content' (the 'post_content' column converted by
    html_to_rst), 'categories' and 'name'.

    Returns the list of those dicts in document order.
    '''
    def column_text(row, column_name):
        # Text of the named <column> below ``row``; None when it is absent.
        column = row.find(".//column[@name='%s']" % column_name)
        return None if column is None else column.text

    post_data = []
    for post_elem in posts_xml.getroot().find('database'):
        if column_text(post_elem, 'post_type') != 'post':
            print('post is not post')
            continue

        title = column_text(post_elem, 'post_title')
        if title == 'Auto Draft':
            print('skipping autodrafts')
            continue

        name = column_text(post_elem, 'post_name')
        # An empty element has text None; convert '' rather than "None".
        html = column_text(post_elem, 'post_content') or ''
        post = {
            'title': title,
            'date': column_text(post_elem, 'post_date'),
            'content': html_to_rst(html),
            'categories': term_lut.get(column_text(post_elem, 'ID'), []),
            'name': name,
        }
        print('%s : %s' % (title, name))
        post_data.append(post)
    return post_data


if __name__ == '__main__':
    # Command-line entry point: the three XML exports are passed as options
    # and handed to export().  All of them are required, because export()
    # parses every one of them unconditionally.
    import argparse
    parser = argparse.ArgumentParser(description='Extract data from wp database xml exports')
    parser.add_argument('--termrel', dest='term_relationship', required=True,
                        help='term relationships export, e.g. wp_term_relationships.xml')
    parser.add_argument('--terms', dest='terms', required=True,
                        help='terms export, e.g. wp_terms.xml')
    parser.add_argument('--posts', dest='posts', required=True,
                        help='posts export, e.g. wp_posts.xml')

    args = parser.parse_args()

    export(args.posts, args.terms, args.term_relationship)
	import xml.etree.ElementTree as ET
	import subprocess
	import unicodedata
	import os

	#posts = 'wp_posts.xml'
	#termrel = 'wp_term_relationships.xml'
	#terms = 'wp_terms.xml'

	# Directory that export() joins with '<post name>.rst' to build the path
	# of every generated post file.
	destination_folder = '/Users/sjt/dev/python/tinkertest/input'


	def html_to_rst(text):
	    '''
	    Convert an HTML fragment to reStructuredText by piping it through pandoc.

	    This function is blatantly lifted from John Paulett's blog:
	    http://johnpaulett.com/2009/10/15/html-to-restructured-text-in-python-using-pandoc/

	    The markup is NFKC-normalised and every character that is still outside
	    ASCII afterwards is dropped before the data is handed to pandoc.

	    text -- the HTML markup.  ``None`` (what ElementTree reports for an
	            empty element) yields an empty result without starting pandoc,
	            instead of being converted to the literal text "None".

	    Returns the byte string pandoc wrote to its standard output.
	    '''
	    if text is None:
	        return b''

	    # ``unicode`` on Python 2, ``str`` on Python 3.
	    text_type = type(u'')
	    if isinstance(text, bytes):
	        # Same implicit ASCII decoding Python 2's unicode() performs.
	        text = text.decode('ascii')
	    else:
	        text = text_type(text)

	    ascii_html = unicodedata.normalize('NFKC', text).encode('ascii', 'ignore')
	    p = subprocess.Popen(['pandoc', '--from=html', '--to=rst'],
	                         stdin=subprocess.PIPE, stdout=subprocess.PIPE)
	    return p.communicate(ascii_html)[0]


	def export(post_path, terms_path, termrel_path, destination=None):
	    '''
	    Write every post found in the XML exports as an .rst file and register
	    it with the ``tinker`` command.

	    The three XML files are parsed, the posts are collected with
	    create_term_lut() and parse_posts() and then handled in ascending date
	    order.  For each post a file "<destination>/<post name>.rst" is written
	    (title, '=' underline, author/categories/tags directives, converted
	    body) and ``tinker --post <file> --date <yyyy/mm/dd>`` is run on it.

	    post_path    -- path of the posts XML export
	    terms_path   -- path of the terms XML export
	    termrel_path -- path of the term relationships XML export
	    destination  -- directory the .rst files are written to; defaults to
	                    the module-level ``destination_folder``

	    Posts without a name or a date are reported and skipped, because
	    neither a file name nor a ``--date`` argument can be built for them.
	    '''
	    import io  # local import: the module header does not import io

	    def as_text(value):
	        # None -> empty text; byte strings (e.g. pandoc output) -> text.
	        if value is None:
	            return u''
	        if isinstance(value, bytes):
	            return value.decode('utf-8', 'replace')
	        return value

	    if destination is None:
	        destination = destination_folder

	    terms_xml = ET.parse(terms_path)
	    termrel_xml = ET.parse(termrel_path)
	    posts_xml = ET.parse(post_path)

	    term_lut = create_term_lut(terms_xml, termrel_xml)

	    posts_data = parse_posts(posts_xml, term_lut)
	    # A missing date sorts first; such posts are skipped below.
	    posts_data.sort(key=lambda post: post['date'] or '')

	    # The child gets /dev/null instead of pipes: pipes that nobody reads
	    # make subprocess.call() block once the child fills the pipe buffer.
	    with open(os.devnull, 'rb') as null_in, open(os.devnull, 'wb') as null_out:
	        # create the files
	        for post in posts_data:
	            if not post['name'] or not post['date']:
	                print('skipping post without name or date: %s' % (post['title'],))
	                continue

	            post_filename = os.path.join(destination, post['name'] + '.rst')
	            print(post_filename)

	            title = as_text(post['title'])
	            categories = u','.join(as_text(c) for c in post['categories'])
	            header_lines = [
	                title,
	                u'=' * len(title),
	                u'',
	                u'.. author:: default',
	                u'.. categories:: ' + categories,
	                u'.. tags:: wp',
	                u'',
	            ]
	            post_file_contents = (u'\n'.join(header_lines) + u'\n' +
	                                  as_text(post['content']))

	            # 'YYYY-MM-DD hh:mm:ss' -> 'YYYY/MM/DD'
	            post_date = post['date'].split(' ')[0]
	            slash_date = '/'.join(post_date.split('-')[:3])

	            # Explicit encoding so non-ASCII titles do not fail on write.
	            with io.open(post_filename, 'w', encoding='utf-8') as post_file:
	                post_file.write(post_file_contents)

	            status = subprocess.call(
	                ['tinker', '--post', post_filename, '--date', slash_date],
	                stdin=null_in, stdout=null_out)
	            if status != 0:
	                print('tinker exited with status %s for %s' % (status, post_filename))



	def create_term_lut(terms_xml, termrel_xml):
	    '''
	    Build a lookup table mapping a post id to the names of its terms.

	    terms_xml   -- parsed terms export; every child of its <database>
	                   element is read for the columns 'term_id' and 'name'
	    termrel_xml -- parsed term relationships export; every child of its
	                   <database> element is read for the columns 'object_id'
	                   and 'term_taxonomy_id'

	    Returns a dict {object_id text: [term name, ...]}.  An object id whose
	    term cannot be resolved still gets an entry (possibly an empty list);
	    the unresolved key is reported on stdout.

	    NOTE(review): 'term_taxonomy_id' is looked up directly in the mapping
	    keyed by 'term_id' -- confirm that the two ids coincide in the export.
	    '''
	    def column_text(row, column_name):
	        # Text of the named <column> below ``row``; None when it is absent.
	        column = row.find(".//column[@name='%s']" % column_name)
	        return None if column is None else column.text

	    terms_dict = {}
	    for table_elem in terms_xml.getroot().find('database'):
	        term_id = column_text(table_elem, 'term_id')
	        if term_id is None:
	            # Row without a usable term id: nothing to register.
	            continue
	        terms_dict[term_id] = column_text(table_elem, 'name')

	    term_lut = {}
	    for table_elem in termrel_xml.getroot().find('database'):
	        post_id = column_text(table_elem, 'object_id')
	        if post_id is None:
	            continue
	        categories = term_lut.setdefault(post_id, [])

	        term_key = column_text(table_elem, 'term_taxonomy_id')
	        try:
	            categories.append(terms_dict[term_key])
	        except KeyError as err:
	            print('got exception: %s' % (err,))

	    return term_lut


	def parse_posts(posts_xml, term_lut):
	    '''
	    Collect the real posts from the parsed posts export.

	    posts_xml -- parsed posts export; every child of its <database> element
	                 is treated as one row
	    term_lut  -- dict mapping the text of a row's 'ID' column to a list of
	                 category names (see create_term_lut)

	    Rows whose 'post_type' is not 'post' (or is missing) and rows titled
	    'Auto Draft' are skipped.  Every remaining row becomes a dict with the
	    keys 'title', 'date', 'content' (the 'post_content' column converted by
	    html_to_rst), 'categories' and 'name'.

	    Returns the list of those dicts in document order.
	    '''
	    def column_text(row, column_name):
	        # Text of the named <column> below ``row``; None when it is absent.
	        column = row.find(".//column[@name='%s']" % column_name)
	        return None if column is None else column.text

	    post_data = []
	    for post_elem in posts_xml.getroot().find('database'):
	        if column_text(post_elem, 'post_type') != 'post':
	            print('post is not post')
	            continue

	        title = column_text(post_elem, 'post_title')
	        if title == 'Auto Draft':
	            print('skipping autodrafts')
	            continue

	        name = column_text(post_elem, 'post_name')
	        # An empty element has text None; convert '' rather than "None".
	        html = column_text(post_elem, 'post_content') or ''
	        post = {
	            'title': title,
	            'date': column_text(post_elem, 'post_date'),
	            'content': html_to_rst(html),
	            'categories': term_lut.get(column_text(post_elem, 'ID'), []),
	            'name': name,
	        }
	        print('%s : %s' % (title, name))
	        post_data.append(post)
	    return post_data



	if __name__ == '__main__':
	    # Command-line entry point: the three XML exports are passed as options
	    # and handed to export().  All of them are required, because export()
	    # parses every one of them unconditionally.
	    import argparse
	    parser = argparse.ArgumentParser(description='Extract data from wp database xml exports')
	    parser.add_argument('--termrel', dest='term_relationship', required=True,
	                        help='term relationships export, e.g. wp_term_relationships.xml')
	    parser.add_argument('--terms', dest='terms', required=True,
	                        help='terms export, e.g. wp_terms.xml')
	    parser.add_argument('--posts', dest='posts', required=True,
	                        help='posts export, e.g. wp_posts.xml')

	    args = parser.parse_args()

	    export(args.posts, args.terms, args.term_relationship)