Skip to content

Instantly share code, notes, and snippets.

@mcnewton
Created March 18, 2018 21:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mcnewton/f8ff21b9b16b7e307d08f214f674a9d1 to your computer and use it in GitHub Desktop.
Save mcnewton/f8ff21b9b16b7e307d08f214f674a9d1 to your computer and use it in GitHub Desktop.
Extract Scribus document text and output HTML for converting to ePub
#! /usr/bin/perl
#
# Go some way to extracting text from a Scribus document and
# converting it to simple HTML that can be used for an ePub file.
#
# Copyright (c) 2018 Matthew Newton
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation
# files (the "Software"), to deal in the Software without
# restriction, including without limitation the rights to use, copy,
# modify, merge, publish, distribute, sublicense, and/or sell copies
# of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
#
#
# This script certainly doesn't do everything, but it's better
# than having the options of "save as ASCII text" or nothing.
#
# The %styles hash defines what actions to take for a paragraph
# with name matching the hash key.
#
# tag: use this tag <around>the</around> paragraph
# pre: output this text before the paragraph (and before the opening tag)
# post: output this text after the paragraph (and closing tag)
# attr: add this attribute to the opening tag
# sub: regex substitute this pattern in the paragraph text...
# repl: ...and replace with this text instead
#
# The %fonts hash defines what to do when a font style is found in
# paragraph text It will append text before and after the styled
# text. This is usually going to be a <span> tag setting the
# correct style.
#
# The suggestion here is to use something like "fixme", which can
# then be searched easily in Sigil or similar, and replaced with
# the correct style name, as the same font might be used for
# multiple things in the source document.
use XML::LibXML;
use XML::LibXML::Iterator;
use utf8;
use open ':std', ':encoding(UTF-8)';
my $debug = 0;
#
# Define paragraph styles that we might find in the source
# document.
#
my %styles = (
"ChapterHeaderNumber" => {
"tag" => "",
"pre" => '<hr class="sigil_split_marker"/><h1>',
"post" => '. ',
},
"ChapterHeaderText" => {
"tag" => "",
"post" => "</h1>\n",
},
"SectionHeading" => {
"tag" => "h2",
"pre" => "\n",
},
"SectionSub" => {
"tag" => "h3",
"pre" => "\n",
},
"Default" => {
"tag" => "p",
},
"Quotation" => {
"tag" => "p",
"attr" => "class=\"bible\"",
},
"Question" => {
"tag" => "p",
"attr" => "class=\"q\"",
},
"BulletList" => {
"tag" => "li",
"pre" => "\n<ul>\n",
"post" => "</ul>\n\n",
"sub" => qr/^.$/,
"repl" => "",
},
);
#
# What to do when we find text that is a different font.
#
my %fonts = (
"Delicious Roman" => {
"pre" => "<span class=\"fixme\">",
"post" => "</span>",
},
"Delicious Bold" => {
"pre" => "<span class=\"fixme\">",
"post" => "</span>",
},
"Delicious Italic" => {
"pre" => "<span class=\"fixme\">",
"post" => "</span>",
},
"Delicious Bold Italic" => {
"pre" => "<span class=\"fixme\">",
"post" => "</span>",
},
"Delicious Heavy Regular" => {
"pre" => "<span class=\"fixme\">",
"post" => "</span>",
},
"Delicious SmallCaps Regular" => {
"pre" => "<span class=\"fixme\">",
"post" => "</span>",
},
);
#
# Load the document and find the bits with text.
#
my $dom = XML::LibXML->load_xml(location => $ARGV[0]);
my $page = $dom->findnodes('/SCRIBUSUTF8NEW/DOCUMENT/PAGEOBJECT/ITEXT');
my $iter = XML::LibXML::Iterator->new($dom);
my @output = ();
my @text = ();
my $prevclass = undef;
my $prevstylepost = "";
#
# Iterate over all the nodes and look for paragraphs.
#
while ($iter->nextNode()) {
my $node = $iter->current();
my $nn = lc $node->nodeName;
# Tab or non-breaking space?
#
push @text, "\t" if $nn eq "tab";
push @text, "&nbsp;" if $nn eq "nbspace";
next unless $node->hasAttributes();
# Some text
#
if ($nn eq "itext") {
my $line = $node->getAttribute("CH");
my $font = $node->getAttribute("FONT");
if (defined $font) {
if (defined $fonts{$font}) {
my $pre = $fonts{$font}{"pre"} || "";
my $post = $fonts{$font}{"post"} || "";
$line = "$pre$line$post";
} else {
$line = "<span style=\"font-name: $font\">$line</span>";
}
}
my @attrs = $node->attributes;
foreach my $attr (@attrs) {
next if $attr->nodeName eq "CH";
next if $attr->nodeName eq "FONT";
print "unparsed attribute '". $attr->nodeName() . "': " . $attr->getValue() . "\n" if $debug;
}
push @text, $line;
}
# A paragraph to hold all the preceeding text
#
elsif ($nn eq "para" or $nn eq "trail") {
my $class = $node->getAttribute("PARENT");
$style = $styles{$class};
if ($prevclass and $class ne $prevclass) {
push @output, $prevstylepost;
push @output, $$style{"pre"} || "";
}
$tag = defined $$style{"tag"} ? $$style{"tag"} : "p";
$attr = $$style{"attr"};
$pre = $$style{"pre"} || "";
$post = $$style{"post"} || "";
if (scalar @text) {
if ($$style{"sub"}) {
$sub = $$style{"sub"};
$repl = $$style{"repl"} || "";
for (my $i=0; $i<=$#text; $i++) {
$text[$i] =~ s/$sub/$repl/s;
}
}
if (defined $attr and not defined $tag) {
die "style $class with attr but no tag";
}
push @output, "<$tag" . (defined $attr ? " $attr" : "") . ">" if $tag;
push @output, join("", @text);
push @output, "</$tag>\n" if $tag;
}
@text = ();
$prevclass = $class;
$prevstylepost = $$style{"post"} || "";
}
# Don't handle other things yet
#
else {
next if $nn eq "color";
next if $nn eq "document";
next if $nn eq "style";
next if $nn eq "checkprofile";
next if $nn eq "tabs";
next if $nn eq "effekte";
next if $nn eq "pdf";
next if $nn eq "subset";
next if $nn eq "fonts";
print "unknown nodename:" . $nn . "\n" if $debug;
}
}
push @output, $prevstylepost;
my $bibleregex = qr/(?:[123](?:&nbsp;|\s))?[A-Z][a-z]+(?:&nbsp;|\s)\d+(?::\d+)?(?:[-–]\d+(?::\d+)?)?/;
foreach my $line (@output) {
#
# Can process all text before it gets printed out. In this
# case we substitute things that *look* like Bible
# book/chapter/verse references and put a <span> around
# them.
#
$line =~ s/($bibleregex)/<span class="ref">$1<\/span>/g;
print $line;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment