Extract Scribus document text and output HTML for converting to ePub
#! /usr/bin/perl | |
# | |
# Go some way to extracting text from a Scribus document and | |
# converting it to simple HTML that can be used for an ePub file. | |
# | |
# Copyright (c) 2018 Matthew Newton | |
# | |
# Permission is hereby granted, free of charge, to any person | |
# obtaining a copy of this software and associated documentation | |
# files (the "Software"), to deal in the Software without | |
# restriction, including without limitation the rights to use, copy, | |
# modify, merge, publish, distribute, sublicense, and/or sell copies | |
# of the Software, and to permit persons to whom the Software is | |
# furnished to do so, subject to the following conditions: | |
# | |
# The above copyright notice and this permission notice shall be | |
# included in all copies or substantial portions of the Software. | |
# | |
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT | |
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, | |
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | |
# DEALINGS IN THE SOFTWARE. | |
# | |
# | |
# This script certainly doesn't do everything, but it's better | |
# than having the options of "save as ASCII text" or nothing. | |
# | |
# The %styles hash defines what actions to take for a paragraph | |
# with name matching the hash key. | |
# | |
# tag: use this tag <around>the</around> paragraph | |
# pre: output this text before the paragraph (and before the opening tag) | |
# post: output this text after the paragraph (and closing tag) | |
# attr: add this attribute to the opening tag | |
# sub: regex substitute this pattern in the paragraph text... | |
# repl: ...and replace with this text instead | |
# | |
# The %fonts hash defines what to do when a font style is found in | |
# paragraph text It will append text before and after the styled | |
# text. This is usually going to be a <span> tag setting the | |
# correct style. | |
# | |
# The suggestion here is to use something like "fixme", which can | |
# then be searched easily in Sigil or similar, and replaced with | |
# the correct style name, as the same font might be used for | |
# multiple things in the source document. | |
use XML::LibXML; | |
use XML::LibXML::Iterator; | |
use utf8; | |
use open ':std', ':encoding(UTF-8)'; | |
my $debug = 0; | |
# | |
# Define paragraph styles that we might find in the source | |
# document. | |
# | |
my %styles = ( | |
"ChapterHeaderNumber" => { | |
"tag" => "", | |
"pre" => '<hr class="sigil_split_marker"/><h1>', | |
"post" => '. ', | |
}, | |
"ChapterHeaderText" => { | |
"tag" => "", | |
"post" => "</h1>\n", | |
}, | |
"SectionHeading" => { | |
"tag" => "h2", | |
"pre" => "\n", | |
}, | |
"SectionSub" => { | |
"tag" => "h3", | |
"pre" => "\n", | |
}, | |
"Default" => { | |
"tag" => "p", | |
}, | |
"Quotation" => { | |
"tag" => "p", | |
"attr" => "class=\"bible\"", | |
}, | |
"Question" => { | |
"tag" => "p", | |
"attr" => "class=\"q\"", | |
}, | |
"BulletList" => { | |
"tag" => "li", | |
"pre" => "\n<ul>\n", | |
"post" => "</ul>\n\n", | |
"sub" => qr/^.$/, | |
"repl" => "", | |
}, | |
); | |
# | |
# What to do when we find text that is a different font. | |
# | |
my %fonts = ( | |
"Delicious Roman" => { | |
"pre" => "<span class=\"fixme\">", | |
"post" => "</span>", | |
}, | |
"Delicious Bold" => { | |
"pre" => "<span class=\"fixme\">", | |
"post" => "</span>", | |
}, | |
"Delicious Italic" => { | |
"pre" => "<span class=\"fixme\">", | |
"post" => "</span>", | |
}, | |
"Delicious Bold Italic" => { | |
"pre" => "<span class=\"fixme\">", | |
"post" => "</span>", | |
}, | |
"Delicious Heavy Regular" => { | |
"pre" => "<span class=\"fixme\">", | |
"post" => "</span>", | |
}, | |
"Delicious SmallCaps Regular" => { | |
"pre" => "<span class=\"fixme\">", | |
"post" => "</span>", | |
}, | |
); | |
# | |
# Load the document and find the bits with text. | |
# | |
my $dom = XML::LibXML->load_xml(location => $ARGV[0]); | |
my $page = $dom->findnodes('/SCRIBUSUTF8NEW/DOCUMENT/PAGEOBJECT/ITEXT'); | |
my $iter = XML::LibXML::Iterator->new($dom); | |
my @output = (); | |
my @text = (); | |
my $prevclass = undef; | |
my $prevstylepost = ""; | |
# | |
# Iterate over all the nodes and look for paragraphs. | |
# | |
while ($iter->nextNode()) { | |
my $node = $iter->current(); | |
my $nn = lc $node->nodeName; | |
# Tab or non-breaking space? | |
# | |
push @text, "\t" if $nn eq "tab"; | |
push @text, " " if $nn eq "nbspace"; | |
next unless $node->hasAttributes(); | |
# Some text | |
# | |
if ($nn eq "itext") { | |
my $line = $node->getAttribute("CH"); | |
my $font = $node->getAttribute("FONT"); | |
if (defined $font) { | |
if (defined $fonts{$font}) { | |
my $pre = $fonts{$font}{"pre"} || ""; | |
my $post = $fonts{$font}{"post"} || ""; | |
$line = "$pre$line$post"; | |
} else { | |
$line = "<span style=\"font-name: $font\">$line</span>"; | |
} | |
} | |
my @attrs = $node->attributes; | |
foreach my $attr (@attrs) { | |
next if $attr->nodeName eq "CH"; | |
next if $attr->nodeName eq "FONT"; | |
print "unparsed attribute '". $attr->nodeName() . "': " . $attr->getValue() . "\n" if $debug; | |
} | |
push @text, $line; | |
} | |
# A paragraph to hold all the preceeding text | |
# | |
elsif ($nn eq "para" or $nn eq "trail") { | |
my $class = $node->getAttribute("PARENT"); | |
$style = $styles{$class}; | |
if ($prevclass and $class ne $prevclass) { | |
push @output, $prevstylepost; | |
push @output, $$style{"pre"} || ""; | |
} | |
$tag = defined $$style{"tag"} ? $$style{"tag"} : "p"; | |
$attr = $$style{"attr"}; | |
$pre = $$style{"pre"} || ""; | |
$post = $$style{"post"} || ""; | |
if (scalar @text) { | |
if ($$style{"sub"}) { | |
$sub = $$style{"sub"}; | |
$repl = $$style{"repl"} || ""; | |
for (my $i=0; $i<=$#text; $i++) { | |
$text[$i] =~ s/$sub/$repl/s; | |
} | |
} | |
if (defined $attr and not defined $tag) { | |
die "style $class with attr but no tag"; | |
} | |
push @output, "<$tag" . (defined $attr ? " $attr" : "") . ">" if $tag; | |
push @output, join("", @text); | |
push @output, "</$tag>\n" if $tag; | |
} | |
@text = (); | |
$prevclass = $class; | |
$prevstylepost = $$style{"post"} || ""; | |
} | |
# Don't handle other things yet | |
# | |
else { | |
next if $nn eq "color"; | |
next if $nn eq "document"; | |
next if $nn eq "style"; | |
next if $nn eq "checkprofile"; | |
next if $nn eq "tabs"; | |
next if $nn eq "effekte"; | |
next if $nn eq "pdf"; | |
next if $nn eq "subset"; | |
next if $nn eq "fonts"; | |
print "unknown nodename:" . $nn . "\n" if $debug; | |
} | |
} | |
push @output, $prevstylepost; | |
my $bibleregex = qr/(?:[123](?: |\s))?[A-Z][a-z]+(?: |\s)\d+(?::\d+)?(?:[-–]\d+(?::\d+)?)?/; | |
foreach my $line (@output) { | |
# | |
# Can process all text before it gets printed out. In this | |
# case we substitute things that *look* like Bible | |
# book/chapter/verse references and put a <span> around | |
# them. | |
# | |
$line =~ s/($bibleregex)/<span class="ref">$1<\/span>/g; | |
print $line; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment