mcnewton/scribus2epub.pl

## scribus2epub.pl
#! /usr/bin/perl

#
# Go some way to extracting text from a Scribus document and
# converting it to simple HTML that can be used for an ePub file.
#
# Copyright (c) 2018 Matthew Newton
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation
# files (the "Software"), to deal in the Software without
# restriction, including without limitation the rights to use, copy,
# modify, merge, publish, distribute, sublicense, and/or sell copies
# of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
#
#
# This script certainly doesn't do everything, but it's better
# than having the options of "save as ASCII text" or nothing.
#
# The %styles hash defines what actions to take for a paragraph
# with name matching the hash key.
#
#   tag:  use this tag <around>the</around> paragraph
#   pre:  output this text before the paragraph (and before the opening tag)
#   post: output this text after the paragraph (and closing tag)
#   attr: add this attribute to the opening tag
#   sub:  regex substitute this pattern in the paragraph text...
#   repl: ...and replace with this text instead
#
# The %fonts hash defines what to do when a font style is found in
# paragraph text It will append text before and after the styled
# text. This is usually going to be a <span> tag setting the
# correct style.
#
# The suggestion here is to use something like "fixme", which can
# then be searched easily in Sigil or similar, and replaced with
# the correct style name, as the same font might be used for
# multiple things in the source document.

use XML::LibXML;
use XML::LibXML::Iterator;

use utf8;
use open ':std', ':encoding(UTF-8)';

my $debug = 0;

#
# Define paragraph styles that we might find in the source
# document.
#

my %styles = (
	"ChapterHeaderNumber" => {
		"tag" => "",
		"pre" => '<hr class="sigil_split_marker"/><h1>',
		"post" => '. ',
	},
	"ChapterHeaderText" => {
		"tag" => "",
		"post" => "</h1>\n",
	},
	"SectionHeading" => {
		"tag" => "h2",
		"pre" => "\n",
	},
	"SectionSub" => {
		"tag" => "h3",
		"pre" => "\n",
	},
	"Default" => {
		"tag" => "p",
	},
	"Quotation" => {
		"tag" => "p",
		"attr" => "class=\"bible\"",
	},
	"Question" => {
		"tag" => "p",
		"attr" => "class=\"q\"",
	},
	"BulletList" => {
		"tag" => "li",
		"pre" => "\n<ul>\n",
		"post" => "</ul>\n\n",
		"sub" => qr/^.$/,
		"repl" => "",
	},
);

#
# What to do when we find text that is a different font.
#

my %fonts = (
	"Delicious Roman" => {
		"pre" => "<span class=\"fixme\">",
		"post" => "</span>",
	},
	"Delicious Bold" => {
		"pre" => "<span class=\"fixme\">",
		"post" => "</span>",
	},
	"Delicious Italic" => {
		"pre" => "<span class=\"fixme\">",
		"post" => "</span>",
	},
	"Delicious Bold Italic" => {
		"pre" => "<span class=\"fixme\">",
		"post" => "</span>",
	},
	"Delicious Heavy Regular" => {
		"pre" => "<span class=\"fixme\">",
		"post" => "</span>",
	},
	"Delicious SmallCaps Regular" => {
		"pre" => "<span class=\"fixme\">",
		"post" => "</span>",
	},
);

#
# Load the document and find the bits with text.
#

my $dom = XML::LibXML->load_xml(location => $ARGV[0]);
my $page = $dom->findnodes('/SCRIBUSUTF8NEW/DOCUMENT/PAGEOBJECT/ITEXT');
my $iter = XML::LibXML::Iterator->new($dom);

my @output = ();
my @text = ();
my $prevclass = undef;
my $prevstylepost = "";

#
# Iterate over all the nodes and look for paragraphs.
#

while ($iter->nextNode()) {
	my $node = $iter->current();
	my $nn = lc $node->nodeName;

	# Tab or non-breaking space?
	#
	push @text, "\t" if $nn eq "tab";
	push @text, "&nbsp;" if $nn eq "nbspace";

	next unless $node->hasAttributes();

	# Some text
	#
	if ($nn eq "itext") {
		my $line = $node->getAttribute("CH");
		my $font = $node->getAttribute("FONT");
		if (defined $font) {
			if (defined $fonts{$font}) {
				my $pre = $fonts{$font}{"pre"} || "";
				my $post = $fonts{$font}{"post"} || "";
				$line = "$pre$line$post";
			} else {
				$line = "<span style=\"font-name: $font\">$line</span>";
			}
		}
		my @attrs = $node->attributes;
		foreach my $attr (@attrs) {
			next if $attr->nodeName eq "CH";
			next if $attr->nodeName eq "FONT";
			print "unparsed attribute '". $attr->nodeName() . "': " . $attr->getValue() . "\n" if $debug;
		}
		push @text, $line;
	}

	# A paragraph to hold all the preceeding text
	#
	elsif ($nn eq "para" or $nn eq "trail") {
		my $class = $node->getAttribute("PARENT");
		$style = $styles{$class};

		if ($prevclass and $class ne $prevclass) {
			push @output, $prevstylepost;
			push @output, $$style{"pre"} || "";
		}

		$tag = defined $$style{"tag"} ? $$style{"tag"} : "p";
		$attr = $$style{"attr"};
		$pre = $$style{"pre"} || "";
		$post = $$style{"post"} || "";
		if (scalar @text) {
			if ($$style{"sub"}) {
				$sub = $$style{"sub"};
				$repl = $$style{"repl"} || "";
				for (my $i=0; $i<=$#text; $i++) {
					$text[$i] =~ s/$sub/$repl/s;
				}
			}
			if (defined $attr and not defined $tag) {
				die "style $class with attr but no tag";
			}
			push @output, "<$tag" . (defined $attr ? " $attr" : "") . ">" if $tag;
			push @output, join("", @text);
			push @output, "</$tag>\n" if $tag;
		}

		@text = ();
		$prevclass = $class;
		$prevstylepost = $$style{"post"} || "";
	}

	# Don't handle other things yet
	#
	else {
		next if $nn eq "color";
		next if $nn eq "document";
		next if $nn eq "style";
		next if $nn eq "checkprofile";
		next if $nn eq "tabs";
		next if $nn eq "effekte";
		next if $nn eq "pdf";
		next if $nn eq "subset";
		next if $nn eq "fonts";
		print "unknown nodename:" . $nn . "\n" if $debug;
	}
}
push @output, $prevstylepost;


my $bibleregex = qr/(?:[123](?:&nbsp;|\s))?[A-Z][a-z]+(?:&nbsp;|\s)\d+(?::\d+)?(?:[-–]\d+(?::\d+)?)?/;

foreach my $line (@output) {

	#
	# Can process all text before it gets printed out. In this
	# case we substitute things that *look* like Bible
	# book/chapter/verse references and put a <span> around
	# them.
	#
	$line =~ s/($bibleregex)/<span class="ref">$1<\/span>/g;

	print $line;
}
	#! /usr/bin/perl

	#
	# Go some way to extracting text from a Scribus document and
	# converting it to simple HTML that can be used for an ePub file.
	#
	# Copyright (c) 2018 Matthew Newton
	#
	# Permission is hereby granted, free of charge, to any person
	# obtaining a copy of this software and associated documentation
	# files (the "Software"), to deal in the Software without
	# restriction, including without limitation the rights to use, copy,
	# modify, merge, publish, distribute, sublicense, and/or sell copies
	# of the Software, and to permit persons to whom the Software is
	# furnished to do so, subject to the following conditions:
	#
	# The above copyright notice and this permission notice shall be
	# included in all copies or substantial portions of the Software.
	#
	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
	# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
	# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
	# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
	# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
	# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
	# DEALINGS IN THE SOFTWARE.
	#
	#
	# This script certainly doesn't do everything, but it's better
	# than having the options of "save as ASCII text" or nothing.
	#
	# The %styles hash defines what actions to take for a paragraph
	# with name matching the hash key.
	#
	# tag: use this tag <around>the</around> paragraph
	# pre: output this text before the paragraph (and before the opening tag)
	# post: output this text after the paragraph (and closing tag)
	# attr: add this attribute to the opening tag
	# sub: regex substitute this pattern in the paragraph text...
	# repl: ...and replace with this text instead
	#
	# The %fonts hash defines what to do when a font style is found in
	# paragraph text It will append text before and after the styled
	# text. This is usually going to be a <span> tag setting the
	# correct style.
	#
	# The suggestion here is to use something like "fixme", which can
	# then be searched easily in Sigil or similar, and replaced with
	# the correct style name, as the same font might be used for
	# multiple things in the source document.

	use XML::LibXML;
	use XML::LibXML::Iterator;

	use utf8;
	use open ':std', ':encoding(UTF-8)';

	my $debug = 0;

	#
	# Define paragraph styles that we might find in the source
	# document.
	#

	my %styles = (
	"ChapterHeaderNumber" => {
	"tag" => "",
	"pre" => '<hr class="sigil_split_marker"/><h1>',
	"post" => '. ',
	},
	"ChapterHeaderText" => {
	"tag" => "",
	"post" => "</h1>\n",
	},
	"SectionHeading" => {
	"tag" => "h2",
	"pre" => "\n",
	},
	"SectionSub" => {
	"tag" => "h3",
	"pre" => "\n",
	},
	"Default" => {
	"tag" => "p",
	},
	"Quotation" => {
	"tag" => "p",
	"attr" => "class=\"bible\"",
	},
	"Question" => {
	"tag" => "p",
	"attr" => "class=\"q\"",
	},
	"BulletList" => {
	"tag" => "li",
	"pre" => "\n<ul>\n",
	"post" => "</ul>\n\n",
	"sub" => qr/^.$/,
	"repl" => "",
	},
	);

	#
	# What to do when we find text that is a different font.
	#

	my %fonts = (
	"Delicious Roman" => {
	"pre" => "<span class=\"fixme\">",
	"post" => "</span>",
	},
	"Delicious Bold" => {
	"pre" => "<span class=\"fixme\">",
	"post" => "</span>",
	},
	"Delicious Italic" => {
	"pre" => "<span class=\"fixme\">",
	"post" => "</span>",
	},
	"Delicious Bold Italic" => {
	"pre" => "<span class=\"fixme\">",
	"post" => "</span>",
	},
	"Delicious Heavy Regular" => {
	"pre" => "<span class=\"fixme\">",
	"post" => "</span>",
	},
	"Delicious SmallCaps Regular" => {
	"pre" => "<span class=\"fixme\">",
	"post" => "</span>",
	},
	);

	#
	# Load the document and find the bits with text.
	#

	my $dom = XML::LibXML->load_xml(location => $ARGV[0]);
	my $page = $dom->findnodes('/SCRIBUSUTF8NEW/DOCUMENT/PAGEOBJECT/ITEXT');
	my $iter = XML::LibXML::Iterator->new($dom);

	my @output = ();
	my @text = ();
	my $prevclass = undef;
	my $prevstylepost = "";

	#
	# Iterate over all the nodes and look for paragraphs.
	#

	while ($iter->nextNode()) {
	my $node = $iter->current();
	my $nn = lc $node->nodeName;

	# Tab or non-breaking space?
	#
	push @text, "\t" if $nn eq "tab";
	push @text, " " if $nn eq "nbspace";

	next unless $node->hasAttributes();

	# Some text
	#
	if ($nn eq "itext") {
	my $line = $node->getAttribute("CH");
	my $font = $node->getAttribute("FONT");
	if (defined $font) {
	if (defined $fonts{$font}) {
	my $pre = $fonts{$font}{"pre"} \|\| "";
	my $post = $fonts{$font}{"post"} \|\| "";
	$line = "$pre$line$post";
	} else {
	$line = "<span style=\"font-name: $font\">$line</span>";
	}
	}
	my @attrs = $node->attributes;
	foreach my $attr (@attrs) {
	next if $attr->nodeName eq "CH";
	next if $attr->nodeName eq "FONT";
	print "unparsed attribute '". $attr->nodeName() . "': " . $attr->getValue() . "\n" if $debug;
	}
	push @text, $line;
	}

	# A paragraph to hold all the preceeding text
	#
	elsif ($nn eq "para" or $nn eq "trail") {
	my $class = $node->getAttribute("PARENT");
	$style = $styles{$class};

	if ($prevclass and $class ne $prevclass) {
	push @output, $prevstylepost;
	push @output, $$style{"pre"} \|\| "";
	}

	$tag = defined $$style{"tag"} ? $$style{"tag"} : "p";
	$attr = $$style{"attr"};
	$pre = $$style{"pre"} \|\| "";
	$post = $$style{"post"} \|\| "";
	if (scalar @text) {
	if ($$style{"sub"}) {
	$sub = $$style{"sub"};
	$repl = $$style{"repl"} \|\| "";
	for (my $i=0; $i<=$#text; $i++) {
	$text[$i] =~ s/$sub/$repl/s;
	}
	}
	if (defined $attr and not defined $tag) {
	die "style $class with attr but no tag";
	}
	push @output, "<$tag" . (defined $attr ? " $attr" : "") . ">" if $tag;
	push @output, join("", @text);
	push @output, "</$tag>\n" if $tag;
	}

	@text = ();
	$prevclass = $class;
	$prevstylepost = $$style{"post"} \|\| "";
	}

	# Don't handle other things yet
	#
	else {
	next if $nn eq "color";
	next if $nn eq "document";
	next if $nn eq "style";
	next if $nn eq "checkprofile";
	next if $nn eq "tabs";
	next if $nn eq "effekte";
	next if $nn eq "pdf";
	next if $nn eq "subset";
	next if $nn eq "fonts";
	print "unknown nodename:" . $nn . "\n" if $debug;
	}
	}
	push @output, $prevstylepost;


	my $bibleregex = qr/(?:[123](?: \|\s))?[A-Z][a-z]+(?: \|\s)\d+(?::\d+)?(?:[-–]\d+(?::\d+)?)?/;

	foreach my $line (@output) {

	#
	# Can process all text before it gets printed out. In this
	# case we substitute things that look like Bible
	# book/chapter/verse references and put a <span> around
	# them.
	#
	$line =~ s/($bibleregex)/<span class="ref">$1<\/span>/g;

	print $line;
	}