Skip to content

Instantly share code, notes, and snippets.

@rhortal
Forked from ruslanosipov/wxr2txt.py
Last active February 2, 2024 15:59
Show Gist options
  • Save rhortal/9728ef872e75a00b804e049f3cc2bd77 to your computer and use it in GitHub Desktop.
Save rhortal/9728ef872e75a00b804e049f3cc2bd77 to your computer and use it in GitHub Desktop.
Script to convert WordPress posts to plain text files. Works with Python 3.x and strips out HTML.
#!/usr/bin/env python3
"""This script converts WXR file to a number of plain text files.
WXR stands for "WordPress eXtended RSS", which is essentially a
regular XML file. This script extracts entries from the WXR file into
plain text files. Output file names: the article name prefixed by the
date for posts, and the article name alone for pages.
Usage: wxr2txt.py filename [-o output_dir]
"""
import os
import re
import sys
from xml.etree import ElementTree
from bs4 import BeautifulSoup
# XML namespace prefixes (and their URIs) used when looking up the
# prefixed elements of a WXR document.
NAMESPACES = dict(
    content='http://purl.org/rss/1.0/modules/content/',
    wp='http://wordpress.org/export/1.2/',
)
# Printed after every error message to remind the user of the call syntax.
USAGE_STRING = "Usage: wxr2txt.py filename [-o output_dir]"
def main(argv):
    """Convert the posts and pages of a WXR export into plain text files.

    Every ``item`` whose ``wp:post_type`` is ``post`` or ``page`` is written
    to its own ``.txt`` file in the output directory, with HTML markup
    stripped from the content. Posts are named ``<date>_<title>.txt`` and
    pages ``<title>.txt``, where ``<date>`` is the date part of
    ``wp:post_date`` with the dashes removed and ``<title>`` is the
    lower-cased title with every run of characters other than ``a-z``,
    ``0-9`` and ``+`` collapsed into a single underscore.

    Args:
        argv: Command line arguments in ``sys.argv`` form (program name,
            input filename and, optionally, ``-o output_dir``).
    """
    filename, output_dir = _parse_and_validate_output(argv)
    try:
        data = ElementTree.parse(filename).getroot()
    except ElementTree.ParseError:
        _error("Invalid input file format. Can not parse the input.")
    channel = data.find('channel')
    if channel is None:
        # Well-formed XML that is not an RSS/WXR document.
        _error("Invalid input file format. Can not parse the input.")
    page_counter, post_counter = 0, 0
    for post in channel.findall('item'):
        post_type = post.findtext('wp:post_type', namespaces=NAMESPACES)
        if post_type not in ('post', 'page'):
            continue
        # findtext() yields '' for missing or empty elements, so drafts
        # without a title or body no longer crash the conversion.
        content = post.findtext(
            'content:encoded', default='', namespaces=NAMESPACES)
        date = post.findtext(
            'wp:post_date', default='', namespaces=NAMESPACES)
        title = post.findtext('title', default='')
        date = date.split(' ')[0].replace('-', '')
        title = re.sub(r'[_]+', '_', re.sub(r'[^a-z0-9+]', '_', title.lower()))
        if post_type == 'post':
            post_filename = date + '_' + title + '.txt'
            post_counter += 1
        else:
            post_filename = title + '.txt'
            page_counter += 1
        content = BeautifulSoup(content, "html.parser").get_text()
        # Explicit UTF-8 so non-ASCII posts are written correctly regardless
        # of the platform's default encoding.
        output_path = os.path.join(output_dir, post_filename)
        with open(output_path, 'w', encoding='utf-8') as post_file:
            post_file.write(content)
        # Counters are updated only in the branches above; an additional
        # unconditional increment here used to count posts twice and pages
        # as posts.
    print("Saved {} posts and {} pages in directory '{}'.".format(
        post_counter, page_counter, output_dir))
def _parse_and_validate_output(argv):
    """Validate the command line and return the input file and output dir.

    Args:
        argv: Arguments in ``sys.argv`` form: ``[prog, filename]`` or
            ``[prog, filename, '-o', output_dir]``.

    Returns:
        A ``(filename, output_dir)`` tuple. ``output_dir`` is the current
        working directory when no ``-o`` option is given.

    Every validation failure is reported through ``_error``.
    """
    if len(argv) not in (2, 4):
        _error("Wrong number of arguments.")
    filename = argv[1]
    if not os.path.isfile(filename):
        _error("Input file does not exist (or not enough permissions).")
    if len(argv) == 4:
        # Reject anything but '-o' instead of silently ignoring the two
        # extra arguments and writing into the current directory.
        if argv[2] != '-o':
            _error("Unknown option '{}'.".format(argv[2]))
        output_dir = argv[3]
    else:
        output_dir = os.getcwd()
    if not os.path.isdir(output_dir):
        _error("Output directory does not exist (or not enough permissions).")
    return filename, output_dir
def _error(text):
    """Report a fatal error together with the usage string, then exit.

    Args:
        text: Human-readable description of what went wrong.

    Raises:
        SystemExit: Always, with exit status 1.
    """
    # Diagnostics go to stderr so they do not mix with regular output
    # when stdout is redirected or piped.
    print(text, file=sys.stderr)
    print(USAGE_STRING, file=sys.stderr)
    sys.exit(1)
# Script entry point: run the conversion only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    sys.exit(main(sys.argv))
@rhortal
Copy link
Author

rhortal commented Feb 2, 2024

Based on https://gist.github.com/ruslanosipov/b748a138389db2cda1e8.
Updated to work with modern Python and added the use of BeautifulSoup's HTML parser to extract only the text of the posts.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment