Skip to content

Instantly share code, notes, and snippets.

@Xliff
Created April 10, 2016 10:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Xliff/34bb9371787d68898cd8b3891794d93b to your computer and use it in GitHub Desktop.
Save Xliff/34bb9371787d68898cd8b3891794d93b to your computer and use it in GitHub Desktop.

Recently used to clean up a bad PDF-to-Word-to-ePub conversion.

#!/usr/bin/perl6

use Inline::Perl5;
#use Mojo::DOM:from<Perl5>;

# Embedded Perl 5 interpreter; it hosts the Mojo::DOM helper subs defined
# below, which the Raku code reaches through $p5.call(...).
my $p5 = Inline::Perl5.new;
# NOTE: this Raku-side $m is not used by any visible code; the Perl 5 helpers
# keep their own package variable $m inside the interpreter.
my $m;
$p5.run(q'
        use strict;
        use warnings;
        use Mojo::DOM;

        # Most recently parsed document, shared by the helpers below.
        our $m;

        # Parse the HTML string $c and remember it as the current document.
        # Arguments are unpacked from @_: a Raku-style signature in the sub
        # header is read by Perl 5 as a prototype and leaves $c unset.
        sub mojo_new {
                my ($c) = @_;

                $m = Mojo::DOM->new($c);

                return $m;
        }

        # Run the CSS selector $t against the current document and return
        # the resulting Mojo::Collection.
        sub mojo_find {
                my ($t) = @_;

                die "mojo_find called before any document was parsed"
                        unless defined $m;

                return $m->find($t);
        }

        # One-shot variant: parse $c as the current document and immediately
        # return the collection of elements matching the CSS selector $t.
        sub mojo_newfind {
                my ($c, $t) = @_;

                $m = Mojo::DOM->new($c);

                return $m->find($t);
        }
');

# Replace the element $oldTag with a <$newTag> element that wraps the same
# inner markup. Returns whatever $oldTag.replace returns.
sub change_tag($oldTag, $newTag) {
        my $inner  = $oldTag.content;
        my $markup = '<' ~ $newTag ~ '>' ~ $inner ~ '</' ~ $newTag ~ '>';
        $oldTag.replace($markup);
}

# Rewrite every <span> beneath $b according to its class attribute:
#   text_5, text_7, text_9, text_26               -> <em>
#   text_12, text_14, text_15, text_19,
#   text_22, text_25                              -> <b>
#   text_13 (and text_14, see NOTE below)         -> <u>
#   text_6, text_10 (exact attribute value)       -> <sup>
#   text_23                                       -> left untouched
#   anything else                                 -> span unwrapped (replaced
#                                                    by its own content)
#
# $b is any node offering find(); the spans found are modified in place.
sub replaceSpans($b) {
        # find() already yields every descendant <span>, nested ones included,
        # so no recursion is needed. Recursing as well handled nested spans a
        # second time through stale, already-replaced nodes, which is where
        # the duplicated output came from.
        my $s_ref = $b.find('span').to_array();

        # The spans are walked in reverse so that a nested span is always
        # handled before the span that encloses it; this assumes find()
        # returns them in document order. When the outer span is rewritten
        # from its content, that content already reflects the inner changes,
        # and no node is touched after being detached.
        for @($s_ref).reverse -> $s {
                # A span without a class attribute is treated as class ''.
                my $s_class = ($s.attr('class') // '').Str;

                given $s_class {
                        # << and >> are word boundaries, so "text_5" no longer
                        # matches inside a longer name such as "text_51".
                        when / << [ text_<[579]> || text_26 ] >> / {
                                change_tag($s, 'em');
                        }

                        # <[25]> is a character class; a bare [25] would be a
                        # group matching the literal text "25".
                        # NOTE(review): text_14 is listed for both <b> and <u>;
                        # the first matching rule (<b>) wins, as before.
                        when / << [ text_1<[2459]> || text_2<[25]> ] >> / {
                                change_tag($s, 'b');
                        }

                        when / << text_1<[34]> >> / {
                                change_tag($s, 'u');
                        }

                        when "text_6" | "text_10" {
                                change_tag($s, 'sup');
                        }

                        when "text_23" {
                                # Deliberately left unchanged.
                        }

                        # Otherwise, replace the <span>...</span> with its contents.
                        default {
                                $s.replace($s.content);
                        }
                }
        }
}

# Remove every <br> found beneath $b, except those whose class attribute is
# exactly 'calibre2' (the line breaks used in chapter sub-headers).
sub removeBreaks($b) {
        my $breaks = $b.find('br').to_array();

        for @$breaks -> $node {
                # Chapter sub-header breaks carry class 'calibre2'; keep them.
                next if $node.attr('class').Str eq 'calibre2';

                $node.remove();
        }
}

# Main driver: for every newindex_split*.html file in the current directory
# (sorted by name), clean up its paragraphs and write the result to
# newindex_NNN.html, numbering the output files from 001.
my $index_count = 1;
for dir(test => /:i ^ newindex_split .+ '.' html $/).sort -> $file {
        # slurp opens, reads and closes the file in one step, so no input
        # handle is left open for the rest of the run.
        my $contents = $file.slurp;

        say "=== $file ===";

        my $p_col = $p5.call('mojo_newfind', $contents, 'p');
        my $p_ref = $p_col.to_array();
        my @new_p = ();
        for @$p_ref -> $p {
                # Change paragraph class to "block_24"
                $p.attr(class => "block_24");

                # Apply <span> tag replacement rules.
                replaceSpans($p);

                # Remove embedded <br> tags.
                removeBreaks($p);

                # If the previous kept paragraph did not end with punctuation,
                # this paragraph is a continuation: append its content to it.
                # Otherwise keep this paragraph as a new entry in @new_p.
                #
                # The first three paragraphs are always kept separate
                # (@new_p.end < 2). NOTE(review): presumably these are
                # heading-like lines; confirm against the input files.
                #
                # The pattern is anchored with $ (end of text). $$ would also
                # match at the end of any line inside the paragraph.
                my $last_P = @new_p.tail[0];
                if (@new_p.end < 2 || $last_P.all_text ~~ /<[ .!?"”— ]> \s* $/) {
                        @new_p.push($p);
                } else {
                        $last_P.content($last_P.content ~ " " ~ $p.content);
                }
        }

        # This will be in every file. There will be only one.
        my $ch = $p5.call('mojo_newfind', $contents, 'h1');
        my $heading = $ch.first;
        note "warning: no <h1> found in $file" unless $heading.defined;

        # Write new file out to disk.
        my $out = open sprintf("newindex_%03d.html", $index_count++), :w;

        $out.print(q:to"END");
<?xml version="1.0" encoding="utf-8" standalone="no"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
  "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">

<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
<head>
  <title>Book Title</title>
  <link href="../Styles/stylesheet.css" rel="stylesheet" type="text/css" />
  <link href="../Styles/page_styles.css" rel="stylesheet" type="text/css" />
</head>

<body class="calibre">
END

        # Chapter heading first (skipped, with the warning above, if missing).
        $out.print("\n{$heading.to_string}\n") if $heading.defined;

        for @new_p -> $p {
                $out.print("\n{$p.to_string}\n");
        }

        $out.print("</body>\n</html>");
        $out.close;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment