Skip to content

Instantly share code, notes, and snippets.

@aegis1980
Forked from ruslanosipov/wxr2txt.py
Last active August 22, 2017 06:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save aegis1980/4d00c381b0eb67f83cf93365cd7b69ad to your computer and use it in GitHub Desktop.
Save aegis1980/4d00c381b0eb67f83cf93365cd7b69ad to your computer and use it in GitHub Desktop.
Script to convert WordPress posts to plain text files
#!/usr/bin/env python
"""This script converts a WordPress WXR file to a number of plain text files.
WXR stands for "WordPress eXtended RSS", which basically is just a
regular XML file. This script extracts entries from the WXR file into
plain text files. Output format: article name prefixed by date for
posts, article name for pages.
Usage: wxr2txt.py filename [-o output_dir]
This fork of the original extracts just the prose content, without headings and (some) shortcodes — you will have to extend the shortcode handling yourself.
It then breaks down content to a sentence by line basis.
"""
import os
import sys
from xml.etree import ElementTree
from html.parser import HTMLParser
# XML namespace prefixes used when querying the WXR document with
# ElementTree's find()/findall() (e.g. 'wp:post_type', 'content:encoded').
NAMESPACES = {
    'content': 'http://purl.org/rss/1.0/modules/content/',
    'wp': 'http://wordpress.org/export/1.2/',
}
# Printed alongside every validation error message (see _error).
USAGE_STRING = "Usage: wxr2txt.py filename [-o output_dir]"
def main(argv):
    """Convert the WXR file named in *argv* to one plain-text file.

    Extracts the prose of every 'post' and 'page' item, strips HTML and
    headings, reflows it one sentence per line, and appends everything to
    ``all.txt`` in the output directory.

    Exits with status 1 (via _error) on bad arguments or unparseable input.
    """
    filename, output_dir = _parse_and_validate_output(argv)
    try:
        data = ElementTree.parse(filename).getroot()
    except ElementTree.ParseError:
        _error("Invalid input file format. Can not parse the input.")
    page_counter, post_counter = 0, 0
    # 'with' guarantees the output file is closed (the original leaked the
    # handle and never closed it).
    with open(os.path.join(output_dir, "all.txt"), 'wb') as all_text_file:
        for post in data.find('channel').findall('item'):
            post_type = post.find('wp:post_type', NAMESPACES).text
            if post_type not in ('post', 'page'):
                continue
            content = post.find('content:encoded', NAMESPACES).text
            # Empty posts export as an empty <content:encoded/> whose .text
            # is None; skip them instead of crashing in strip_tags().
            if not content:
                continue
            # Count each item exactly once (the original bumped post_counter
            # a second time after writing, double-counting every item).
            if post_type == 'post':
                post_counter += 1
            else:
                page_counter += 1
            content = strip_tags(content)
            content = split_lines(content)
            all_text_file.write(content.encode('utf8'))
    print("Saved {} posts and {} pages in directory {}".format(
        post_counter, page_counter, output_dir))
def _parse_and_validate_output(argv):
    """Validate command-line arguments and return ``(filename, output_dir)``.

    Accepts either ``prog filename`` or ``prog filename -o output_dir``;
    the output directory defaults to the current working directory.
    Exits with status 1 (via _error) on any validation failure.
    """
    if len(argv) not in (2, 4):
        _error("Wrong number of arguments.")
    filename = argv[1]
    if not os.path.isfile(filename):
        _error("Input file does not exist (or not enough permissions).")
    if len(argv) == 4 and argv[2] == '-o':
        output_dir = argv[3]
    else:
        output_dir = os.getcwd()
    if not os.path.isdir(output_dir):
        _error("Output directory does not exist (or not enough permissions).")
    return filename, output_dir
def _error(text):
    """Print *text* followed by the usage string, then exit with status 1."""
    for line in (text, USAGE_STRING):
        print(line)
    sys.exit(1)
class BodyStripper(HTMLParser):
    """HTML parser that collects body text while skipping heading content.

    feed() first rewrites some WordPress markup — CDATA wrappers are
    removed, and ``[caption]``/``[stextbox]`` shortcodes are turned into
    ``<h5>`` elements so their contents are skipped along with real
    headings. Call get_data() (optionally after strip()) for the result.
    """

    # Only the six real heading elements. The original tested
    # tag.startswith("h"), which also matched <hr>, <html>, <head>,
    # <header>, ... — a void <hr> has no closing tag, so in_heading stayed
    # True and all following text was silently dropped.
    _HEADINGS = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')

    def error(self, message):
        # HTMLParser.error override (required on some Python versions);
        # parse errors are deliberately ignored.
        pass

    def __init__(self):
        super().__init__()
        self.reset()
        self.fed = []            # text chunks collected outside headings
        self.in_heading = False  # True while inside a heading element

    def feed(self, data):
        """Pre-process WordPress markup in *data*, then parse it."""
        # Remove the complete CDATA terminator "]]>". The original removed
        # only "]]", leaving a stray ">" in the extracted text.
        data = data.replace("<![CDATA[", "").replace("]]>", "")
        # Rewrite shortcodes as <h5> so handle_data skips their contents.
        data = data.replace('[caption', '<h5>').replace("[/caption]", "</h5>")
        data = data.replace('[stextbox', '<h5>').replace("[/stextbox]", " </h5>")
        super().feed(data)

    def handle_starttag(self, tag, attrs):
        if tag in self._HEADINGS:
            self.in_heading = True

    def handle_endtag(self, tag):
        if tag in self._HEADINGS:
            self.in_heading = False

    def handle_data(self, d):
        if not self.in_heading:
            self.fed.append(d)

    def get_data(self):
        """Return all collected text as a single string."""
        return ''.join(self.fed)

    def strip(self):
        """Collapse newlines and runs of whitespace in the collected text.

        Replaces self.fed (a list) with the normalized string; get_data's
        ''.join() is a no-op on a string, so it still works afterwards.
        """
        text = ' '.join(self.fed)
        text = text.replace('\n', ' ').replace('\r', '')
        self.fed = ' '.join(text.split())
def strip_tags(html):
    """Return the prose of *html* with tags and heading text removed and
    whitespace normalized to single spaces."""
    stripper = BodyStripper()
    stripper.feed(html)
    stripper.strip()
    return stripper.get_data()
def split_lines(text):
    """Insert a newline after every full stop, one sentence per line.

    Equivalent to ``'.\\n'.join(text.split('.'))``.
    """
    return text.replace('.', '.\n')
# Script entry point: pass the raw command-line arguments straight to main().
if __name__ == "__main__":
    main(sys.argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment