Recently used to clean up a bad PDF-to-Word-to-ePub conversion.
#!/usr/bin/perl6
use Inline::Perl5;
#use Mojo::DOM:from<Perl5>;
# Embedded Perl 5 interpreter. Mojo::DOM is driven through the helper subs
# defined in the Perl 5 snippet below and invoked with $p5.call(...).
my $p5 = Inline::Perl5.new;
# NOTE(review): this Raku-side $m is not referenced by any Raku code visible
# here; the Perl 5 helpers below keep their own, separate $m.
my $m;
$p5.run(q'
use strict;
use warnings;
use Mojo::DOM;

# Most recently parsed document, shared by the helpers below.
my $m;

# mojo_new($html): parse $html, remember the DOM in $m and return it.
# Arguments are unpacked from @_ because no signature feature is enabled
# in this snippet.
sub mojo_new {
    my ($c) = @_;
    $m = Mojo::DOM->new($c);
    return $m;
}

# mojo_find($selector): run a CSS selector against the document stored by
# the last mojo_new/mojo_newfind call and return the resulting collection.
sub mojo_find {
    my ($t) = @_;
    return $m->find($t);
}

# cw: The above efforts to make this more efficient resulted in miserable
# failures.

# mojo_newfind($html, $selector): parse $html, remember the DOM in $m and
# return the collection of elements matching $selector.
sub mojo_newfind {
    my ($c, $t) = @_;
    $m = Mojo::DOM->new($c);
    return $m->find($t);
}
');
# Replace the element $oldTag in its document with a bare <$newTag> element
# wrapping the same inner markup. Only the content is carried over; the old
# element's attributes are not copied to the new tag.
sub change_tag($oldTag, $newTag) {
    my $inner = $oldTag.content;
    $oldTag.replace("<$newTag>" ~ $inner ~ "</$newTag>");
}
# Rewrite every <span> beneath $b according to its "class" attribute:
#
#   text_5, text_7, text_9, text_26                       -> <em>
#   text_12, text_14, text_15, text_19, text_22, text_25  -> <b>
#   text_13 (text_14 is already claimed by the <b> rule)  -> <u>
#   text_6, text_10                                       -> <sup>
#   text_23                                               -> left unchanged
#   anything else (including no class at all)             -> unwrapped, i.e.
#                                                            replaced by its
#                                                            own content
#
# $b is a Mojo::DOM node (a Perl 5 object reached through Inline::Perl5).
sub replaceSpans($b) {
    # find() already returns *all* descendant spans, nested ones included,
    # so no recursion is needed. Recursing made nested spans get handled
    # twice: once in the recursive call and again, as stale nodes, by the
    # outer loop, which could append duplicate markup to a parent that was
    # still part of the document.
    my $s_ref = $b.find('span').to_array();
    my @spans;
    @spans.push($_) for @$s_ref;

    # Walk the spans in reverse document order so that every nested span is
    # rewritten before the span that contains it. The outer span's content
    # is then read after its children have already been converted.
    for @spans.reverse -> $s {
        # A span without a class attribute yields an undefined value; treat
        # it as an empty class name so it falls through to the default case.
        my $s_class = ($s.attr('class') // '').Str;
        given $s_class {
            # The patterns are anchored so that only the exact class names
            # listed above match, like the string comparisons further down.
            when /^ text_ [ <[579]> || 26 ] $/ {
                change_tag($s, 'em');
            }
            # <[25]> is a character class; a bare [25] would be a group
            # matching the literal text "25".
            when /^ text_ [ 1<[2459]> || 2<[25]> ] $/ {
                change_tag($s, 'b');
            }
            when /^ text_1<[34]> $/ {
                change_tag($s, 'u');
            }
            when "text_6" | "text_10" {
                change_tag($s, 'sup');
            }
            when "text_23" {
                # Deliberately left as a <span>.
            }
            default {
                # Unknown class: drop the <span> wrapper, keep what is inside.
                $s.replace($s.content);
            }
        }
    }
}
# Remove every <br> element beneath $b, except those carrying the class
# 'calibre2' (the breaks used in the chapter sub-header).
#
# $b is a Mojo::DOM node (a Perl 5 object reached through Inline::Perl5).
sub removeBreaks($b) {
    my $b_ref = $b.find('br').to_array();
    for @$b_ref -> $br {
        # Most <br> tags have no class attribute at all; default the
        # undefined value to '' so stringifying it does not emit a warning.
        my $br_class = ($br.attr('class') // '').Str;
        $br.remove() unless $br_class eq 'calibre2';
    }
}
# Main driver: clean up every "newindex_split*.html" file in the current
# directory (matched case-insensitively, processed in sorted order) and write
# the result to newindex_001.html, newindex_002.html, ...
my $index_count = 1;
for dir(test => /:i ^ newindex_split .+ '.' html $/).sort -> $file {
    # IO::Path.slurp opens, reads and closes the file in one step, so no
    # input handle is left open for each processed file.
    my $contents = $file.slurp;
    say "=== $file ===";
    #$contents ~~ s/.+? ('<' p.+? '>')/$0/;
    #my @tags = $contents ~~ m:g/ :r '<' (<-[\s]>+(.+?)?) '>' (.+) '</' $0 '>'/;
    #$p5.call('mojo_new', $contents);

    # Parse the document and collect all of its <p> elements.
    my $p_col = $p5.call('mojo_newfind', $contents, 'p');
    my $p_ref = $p_col.to_array();
    my @new_p = ();
    for @$p_ref -> $p {
        # Change paragraph class to "block_24"
        $p.attr(class => "block_24");
        # Apply <span> tag replacement rules.
        replaceSpans($p);
        # Remove embedded <br> tags.
        removeBreaks($p);

        # Keep this paragraph as its own entry in @new_p when the previously
        # kept paragraph ends with one of the characters in the class below
        # (optionally followed by whitespace); otherwise append this
        # paragraph's content to the previous one, separated by a space.
        # NOTE(review): `.end < 2` means the first three paragraphs are
        # always kept separate, not only the very first one -- confirm that
        # this is intended.
        my $last_P = @new_p.tail[0];
        if @new_p.end < 2 || $last_P.all_text ~~ /<[ .!?"”— ]> \s* $$/ {
            @new_p.push($p);
        } else {
            $last_P.content($last_P.content ~ " " ~ $p.content);
        }
    }

    # This will be in every file. There will be only one.
    # The <h1> is looked up in a fresh parse of the original contents.
    my $ch = $p5.call('mojo_newfind', $contents, 'h1');

    # Write new file out to disk: fixed XHTML header, the chapter heading,
    # the merged paragraphs, then the closing tags.
    my $out = open sprintf("newindex_%03d.html", $index_count++), :w;
    # The heredoc body is indented to the same depth as its END terminator;
    # that common indentation is stripped from the emitted text.
    $out.print(q:to"END");
    <?xml version="1.0" encoding="utf-8" standalone="no"?>
    <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
    "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
    <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
    <head>
    <title>Book Title</title>
    <link href="../Styles/stylesheet.css" rel="stylesheet" type="text/css" />
    <link href="../Styles/page_styles.css" rel="stylesheet" type="text/css" />
    </head>
    <body class="calibre">
    END
    $out.print("\n{$ch.first.to_string}\n");
    for @new_p -> $p {
        $out.print("\n{$p.to_string}\n");
    }
    $out.print("</body>\n</html>");
    $out.close;
}