Skip to content

Instantly share code, notes, and snippets.

@aharonium

aharonium/wxr2txt.py

Forked from ruslanosipov/wxr2txt.py
Last active Jul 19, 2019
Embed
What would you like to do?
Script to convert WordPress posts to plain text files
#!/usr/bin/env python
"""This script converts WXR file to a number of plain text files.
WXR stands for "WordPress eXtended RSS", which basically is just a
regular XML file. This script extracts entries from the WXR file into
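plain text files. Output format: one file per post or page, named after
the last segment of its permalink (".html.md" for posts, ".html" for
pages) and placed in a directory tree that mirrors the permalink path.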
Usage: wxr2txt.py filename [-o output_dir]
"""
import os
import re
import sys
from xml.etree import ElementTree
from urlparse import urlparse
# XML namespace prefixes used when looking up WXR elements.
NAMESPACES = {
    'content': 'http://purl.org/rss/1.0/modules/content/',
    'wp': 'http://wordpress.org/export/1.2/',
    'excerpt': 'http://wordpress.org/export/1.2/excerpt/',
    'wfw': 'http://wellformedweb.org/CommentAPI/',
    'dc': 'http://purl.org/dc/elements/1.1/',
}

# Printed by _error().  Keep this a single literal: a bare string placed
# after it on the same line is concatenated into the message, not ignored.
USAGE_STRING = "Usage: wxr2txt.py filename [-o output_dir]"


def _child_text(item, tag):
    """Return the text of child *tag* of *item*, or '' if absent or empty."""
    node = item.find(tag, namespaces=NAMESPACES)
    if node is None or node.text is None:
        return ''
    return node.text


def _split_permalink(link):
    """Split a permalink into ``(relative_dir, slug)``.

    The scheme, query string and fragment are dropped.  Empty, '.' and
    '..' segments are discarded so the result is always a relative path
    that cannot climb out of the directory it is joined to.  The host
    name is kept as the first directory level.

    ``slug`` is '' when the link has no path segment after the host
    (for example ``http://host/?p=12``) or when the link is empty.
    """
    location = re.sub(r'^[A-Za-z][A-Za-z0-9+.-]*://', '', link)
    location = re.split(r'[?#]', location, maxsplit=1)[0]
    segments = [s for s in location.split('/') if s not in ('', '.', '..')]
    if len(segments) < 2:
        return (os.path.join(*segments) if segments else ''), ''
    return os.path.join(*segments[:-1]), segments[-1]


def _render_page(title, author, postidlink, content):
    """Return the HTML text written for one post or page."""
    return ''.join([
        '<html>\n<head></head>\n<body>\nTitle: ', title, '<br />\n',
        'Primary contributor: ', author, '<br />\n',
        'For attribution and license, please consult the following URL: '
        '<a href="', postidlink, '">', postidlink, '</a>\n<p />\n<hr />\n\n',
        content,
        '\n</body>\n</html>',
    ])


def main(argv):
    """Convert every post and page of a WXR export into an HTML file.

    Args:
        argv: full argument vector, program name first, as accepted by
            _parse_and_validate_output().

    Each item whose ``wp:post_type`` is 'post' or 'page' is written below
    the output directory, in sub-directories that mirror its permalink
    (host name first).  The file is named after the last permalink
    segment, with '.html.md' appended for posts and '.html' for pages.
    Items whose link carries no such segment are named
    '<post_type>-<post_id>' instead.

    Exits through _error() when the input cannot be parsed.  A summary
    line is printed on success.
    """
    filename, output_dir = _parse_and_validate_output(argv)
    try:
        data = ElementTree.parse(filename).getroot()
    except ElementTree.ParseError:
        _error("Invalid input file format. Can not parse the input.")
    channel = data.find('channel')
    if channel is None:
        _error("Invalid input file format. No <channel> element found.")

    page_counter, post_counter = 0, 0
    for post in channel.findall('item'):
        post_type = _child_text(post, 'wp:post_type')
        if post_type not in ('post', 'page'):
            continue
        postid = _child_text(post, 'wp:post_id')
        relative_dir, slug = _split_permalink(_child_text(post, 'link'))
        if not slug:
            # No usable permalink segment: fall back to a name that is
            # unique per item instead of failing on an unusable path.
            slug = post_type + '-' + postid

        # relative_dir never starts with a separator, so the join cannot
        # discard output_dir.
        target_dir = os.path.join(output_dir, relative_dir)
        if not os.path.isdir(target_dir):
            os.makedirs(target_dir)

        if post_type == 'post':
            post_filename = slug + '.html.md'
            post_counter += 1
        else:
            post_filename = slug + '.html'
            page_counter += 1

        page = _render_page(
            _child_text(post, 'title'),
            _child_text(post, 'dc:creator'),
            'http://opensiddur.org/?p=' + postid,
            _child_text(post, 'content:encoded'),
        )
        # Encode once at the I/O boundary so non-ASCII titles, authors
        # and bodies are all written as UTF-8.
        with open(os.path.join(target_dir, post_filename), 'wb') as post_file:
            post_file.write(page.encode('utf8'))

    print("Saved {} posts and {} pages in directory '{}'.".format(
        post_counter, page_counter, output_dir))
def _parse_and_validate_output(argv):
if len(argv) not in (2, 4):
_error("Wrong number of arguments.")
filename = argv[1]
if not os.path.isfile(filename):
_error("Input file does not exist (or not enough permissions).")
output_dir = argv[3] if len(argv) == 4 and argv[2] == '-o' else os.getcwd()
if not os.path.isdir(output_dir):
_error("Output directory does not exist (or not enough permissions).")
return filename, output_dir
def _error(text):
    """Report *text* and the usage line on stderr, then exit with status 1.

    Args:
        text: one-line description of what went wrong.

    Raises:
        SystemExit: always, with exit status 1.
    """
    # Diagnostics go to stderr so they are not mixed into redirected
    # standard output.
    sys.stderr.write(text + '\n')
    sys.stderr.write(USAGE_STRING + '\n')
    sys.exit(1)
# Run the converter only when this file is executed as a script. The full
# argument vector (program name included) is handed to main() unchanged.
if __name__ == "__main__":
    main(sys.argv)
@aharonium

This comment has been minimized.

Copy link
Owner Author

@aharonium aharonium commented Jul 16, 2019

This fork breaks the output directory option that ruslanosipov had in his original script. The script will create a directory tree of wordpress posts based on the post categories and sub-categories indicated in their permalink.

I still want to figure out how I can parse elements with multiple values (categories, tags, co-authors) as well as postmeta info. Please help if you can.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.