Skip to content

Instantly share code, notes, and snippets.

@aegis1980
Forked from ruslanosipov/wxr2txt.py
Last active August 22, 2017 06:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save aegis1980/4d00c381b0eb67f83cf93365cd7b69ad to your computer and use it in GitHub Desktop.
Save aegis1980/4d00c381b0eb67f83cf93365cd7b69ad to your computer and use it in GitHub Desktop.
Script to convert WordPress posts to plain text files
#!/usr/bin/env python
"""This script converts a WordPress WXR file to a number of plain text files.
WXR stands for "WordPress eXtended RSS", which basically is just a
regular XML file. This script extracts entries from the WXR file into
plain text files. Output format: article name prefixed by date for
posts, article name for pages.
Usage: wxr2txt.py filename [-o output_dir]
This fork of the original extracts just the prose content, without headings and (some) shortcodes — you will have to extend the shortcode handling yourself.
It then breaks down content to a sentence by line basis.
"""
import os
import sys
from xml.etree import ElementTree
from html.parser import HTMLParser
# XML namespace prefixes used when querying the WXR document with
# ElementTree's find()/findall() (e.g. 'wp:post_type', 'content:encoded').
NAMESPACES = {
    'content': 'http://purl.org/rss/1.0/modules/content/',
    'wp': 'http://wordpress.org/export/1.2/',
}
# Printed alongside every validation error message (see _error).
USAGE_STRING = "Usage: wxr2txt.py filename [-o output_dir]"
def main(argv):
    """Convert the WXR file named in *argv* to one plain-text file.

    Extracts the prose of every 'post' and 'page' item, strips HTML and
    headings, reflows it one sentence per line, and appends everything to
    ``all.txt`` in the output directory.

    Exits with status 1 (via _error) on bad arguments or unparseable input.
    """
    filename, output_dir = _parse_and_validate_output(argv)
    try:
        data = ElementTree.parse(filename).getroot()
    except ElementTree.ParseError:
        _error("Invalid input file format. Can not parse the input.")
    page_counter, post_counter = 0, 0
    # 'with' guarantees the output file is closed (the original leaked the
    # handle and never closed it).
    with open(os.path.join(output_dir, "all.txt"), 'wb') as all_text_file:
        for post in data.find('channel').findall('item'):
            post_type = post.find('wp:post_type', NAMESPACES).text
            if post_type not in ('post', 'page'):
                continue
            content = post.find('content:encoded', NAMESPACES).text
            # Empty posts export as an empty <content:encoded/> whose .text
            # is None; skip them instead of crashing in strip_tags().
            if not content:
                continue
            # Count each item exactly once (the original bumped post_counter
            # a second time after writing, double-counting every item).
            if post_type == 'post':
                post_counter += 1
            else:
                page_counter += 1
            content = strip_tags(content)
            content = split_lines(content)
            all_text_file.write(content.encode('utf8'))
    print("Saved {} posts and {} pages in directory {}".format(
        post_counter, page_counter, output_dir))
def _parse_and_validate_output(argv):
    """Validate command-line arguments and return ``(filename, output_dir)``.

    Accepts either ``prog filename`` or ``prog filename -o output_dir``;
    the output directory defaults to the current working directory.
    Exits with status 1 (via _error) on any validation failure.
    """
    if len(argv) not in (2, 4):
        _error("Wrong number of arguments.")
    filename = argv[1]
    if not os.path.isfile(filename):
        _error("Input file does not exist (or not enough permissions).")
    if len(argv) == 4 and argv[2] == '-o':
        output_dir = argv[3]
    else:
        output_dir = os.getcwd()
    if not os.path.isdir(output_dir):
        _error("Output directory does not exist (or not enough permissions).")
    return filename, output_dir
def _error(text):
    """Print *text* followed by the usage string, then exit with status 1."""
    for line in (text, USAGE_STRING):
        print(line)
    sys.exit(1)
class BodyStripper(HTMLParser):
    """HTML parser that collects body text while skipping heading content.

    feed() first rewrites some WordPress markup — CDATA wrappers are
    removed, and ``[caption]``/``[stextbox]`` shortcodes are turned into
    ``<h5>`` elements so their contents are skipped along with real
    headings. Call get_data() (optionally after strip()) for the result.
    """

    # Only the six real heading elements. The original tested
    # tag.startswith("h"), which also matched <hr>, <html>, <head>,
    # <header>, ... — a void <hr> has no closing tag, so in_heading stayed
    # True and all following text was silently dropped.
    _HEADINGS = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')

    def error(self, message):
        # HTMLParser.error override (required on some Python versions);
        # parse errors are deliberately ignored.
        pass

    def __init__(self):
        super().__init__()
        self.reset()
        self.fed = []            # text chunks collected outside headings
        self.in_heading = False  # True while inside a heading element

    def feed(self, data):
        """Pre-process WordPress markup in *data*, then parse it."""
        # Remove the complete CDATA terminator "]]>". The original removed
        # only "]]", leaving a stray ">" in the extracted text.
        data = data.replace("<![CDATA[", "").replace("]]>", "")
        # Rewrite shortcodes as <h5> so handle_data skips their contents.
        data = data.replace('[caption', '<h5>').replace("[/caption]", "</h5>")
        data = data.replace('[stextbox', '<h5>').replace("[/stextbox]", " </h5>")
        super().feed(data)

    def handle_starttag(self, tag, attrs):
        if tag in self._HEADINGS:
            self.in_heading = True

    def handle_endtag(self, tag):
        if tag in self._HEADINGS:
            self.in_heading = False

    def handle_data(self, d):
        if not self.in_heading:
            self.fed.append(d)

    def get_data(self):
        """Return all collected text as a single string."""
        return ''.join(self.fed)

    def strip(self):
        """Collapse newlines and runs of whitespace in the collected text.

        Replaces self.fed (a list) with the normalized string; get_data's
        ''.join() is a no-op on a string, so it still works afterwards.
        """
        text = ' '.join(self.fed)
        text = text.replace('\n', ' ').replace('\r', '')
        self.fed = ' '.join(text.split())
def strip_tags(html):
    """Return the prose of *html* with tags and heading text removed and
    whitespace normalized to single spaces."""
    stripper = BodyStripper()
    stripper.feed(html)
    stripper.strip()
    return stripper.get_data()
def split_lines(text):
    """Insert a newline after every full stop, one sentence per line.

    Equivalent to ``'.\\n'.join(text.split('.'))``.
    """
    return text.replace('.', '.\n')
# Script entry point: pass the raw command-line arguments straight to main().
if __name__ == "__main__":
    main(sys.argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment