hippietrail/html2txt.pl

## html2txt.pl
#!/usr/bin/perl

# TODO img alt text?
# TODO turn into a module so randomwikipage.pl can use it

use strict;

use Encode 'decode_utf8';
use File::DosGlob;
use HTML::Parser;

binmode(STDOUT, ':utf8');

my %unktags;    # unknown tags

# from HTML 4.0 DTD
#
# <!-- %inline; covers inline or "text-level" elements -->
# <!ENTITY % inline "#PCDATA | %fontstyle; | %phrase; | %special; | %formctrl;">
# <!ENTITY % fontstyle
#  "TT | I | B | BIG | SMALL">
# <!ENTITY % phrase "EM | STRONG | DFN | CODE |
#                    SAMP | KBD | VAR | CITE | ABBR | ACRONYM" >
# <!ENTITY % special
#    "A | IMG | OBJECT | BR | SCRIPT | MAP | Q | SUB | SUP | SPAN | BDO">
# <!ENTITY % formctrl "INPUT | SELECT | TEXTAREA | LABEL | BUTTON">
#
#
# <!ENTITY % block
#      "P | %heading; | %list; | %preformatted; | DL | DIV | NOSCRIPT |
#       BLOCKQUOTE | FORM | HR | TABLE | FIELDSET | ADDRESS">
# <!ENTITY % heading "H1|H2|H3|H4|H5|H6">
# <!ENTITY % list "UL | OL">
# <!ENTITY % preformatted "PRE">

my %tagtype = (
    'TT' => 'inline', 'I' => 'inline', 'B' => 'inline', 'BIG' => 'inline',
    'SMALL' => 'inline',

    'EM' => 'inline', 'STRONG' => 'inline', 'DFN' => 'inline',
    'CODE' => 'inline',

    'SAMP' => 'inline', 'KBD' => 'inline', 'VAR' => 'inline',
    'CITE' => 'inline', 'ABBR' => 'inline', 'ACRONYM' => 'inline',

    'A' => 'inline', 'IMG' => 'inline', 'OBJECT' => 'inline', 'BR' => 'inline',
    'SCRIPT' => 'inline', 'MAP' => 'inline', 'Q' => 'inline', 'SUB' => 'inline',
    'SUP' => 'inline', 'SPAN' => 'inline', 'BDO' => 'inline',

    'INPUT' => 'inline', 'SELECT' => 'inline', 'TEXTAREA' => 'inline',
    'LABEL' => 'inline', 'BUTTON' => 'inline',

    'P' => 'block', 'DL' => 'block', 'DIV' => 'block', 'NOSCRIPT' => 'block',

    'BLOCKQUOTE' => 'block', 'FORM' => 'block', 'HR' => 'block',
    'TABLE' => 'block', 'FIELDSET' => 'block', 'ADDRESS' => 'block',

    'H1' => 'block', 'H2' => 'block', 'H3' => 'block', 'H4' => 'block',
    'H5' => 'block', 'H6' => 'block',

    'UL' => 'block', 'OL' => 'block',

    'PRE' => 'block',

    # not defined as block in the dtd but usually breaks lines
    'LI' => 'block', 'DD' => 'block', 'DT' => 'block',
    'IFRAME' => 'block', 'TBODY' => 'block', 'TR' => 'block', 'TD' => 'block',

    # not defined as inline in the dtd but testing shows they behave so
    'OPTION' => 'inline', 'PARAM' => 'inline', 'EMBED' => 'inline',
    'FONT' => 'inline',

    # peculiarity of HTML::Parser??
    'BR/' => 'inline',

    # TODO BASE, CENTER, AREA, TH, LEGEND, INS, THEAD, BLINK, DEL, MARQUEE
);

my @files = glob "@ARGV";

print STDERR join("\n", @files), "\n";

for (@files) {
    process_file($_);
}

print STDERR "----\n\n";

#dump_unknown_tags();

exit;

####################################################################

sub process_file {
    my $filename = shift;

    print STDERR "stripping $filename...\n";

    our %inside = ();

    # slurp in an HTML file
    open(FH, $filename);
    my @content = <FH>;
    close(FH);

    my $content = join('', @content);

    our $txt = '';

    my $parser = HTML::Parser->new(
                       start_h => [\&tag,  "tagname, '+1'"],
                       end_h   => [\&tag,  "tagname, '-1'"],
                       text_h  => [\&text, "dtext"],
                      );

    $parser->parse( decode_utf8( $content ) );

    $parser->eof;

    # clean up whitespace etc

    # turn lines with just space to blank lines
    $txt =~ s/^[ \t]*(.*?)[ \t]*$/\1/mg;

    # minify paragraph breaks
    $txt =~ s/\n\n\n+/\n\n/g;

    # minify spaces and tabs
    $txt =~ s/[ \t]+/ /g;

    print $txt;

    dump_unknown_tags();

    return;

    ################################################################

    sub tag {
        my($tag, $opt_n) = @_;

        $inside{$tag} += $opt_n;

        my $type = $tagtype{uc $tag};

        # do nothing for inline tags, script and style tags
        if ($type eq 'inline' || $tag eq 'script' || $tag eq 'style') {
            # \n for <br>
            if ($tag eq 'br' || $tag eq 'br/') {
                $txt .= "\n";
            } elsif ($tag eq 'img') {
                $txt .= ' ';
            }
        # \n\n for block tags
        } elsif ($type eq 'block') {
            $txt .= "\n\n";
        # neither inline nor block!
        } else {
            unless ($inside{script} || $inside{style}) {
                unless (grep (/^$tag$/, ('html', 'head', 'title', 'meta', 'link', 'body'))) {
                    ++ $unktags{$tag} if $opt_n == 1;
                }
            }
        }
    }

    sub text {
        return if $inside{script} || $inside{style};
        return if $inside{head} && !$inside{title};

        my $t = $_[0];

        $t =~ s/\s+/ /g;

        $txt .= $t;
    }
}

sub dump_unknown_tags {
    foreach (sort {$unktags{$b} <=> $unktags{$a}} keys %unktags) {
        print STDERR "$_: $unktags{$_}\n";
    }
}
	#!/usr/bin/perl

	# TODO img alt text?
	# TODO turn into a module so randomwikipage.pl can use it

	use strict;

	use Encode 'decode_utf8';
	use File::DosGlob;
	use HTML::Parser;

	binmode(STDOUT, ':utf8');

	my %unktags; # unknown tags

	# from HTML 4.0 DTD
	#
	# <!-- %inline; covers inline or "text-level" elements -->
	# <!ENTITY % inline "#PCDATA \| %fontstyle; \| %phrase; \| %special; \| %formctrl;">
	# <!ENTITY % fontstyle
	# "TT \| I \| B \| BIG \| SMALL">
	# <!ENTITY % phrase "EM \| STRONG \| DFN \| CODE \|
	# SAMP \| KBD \| VAR \| CITE \| ABBR \| ACRONYM" >
	# <!ENTITY % special
	# "A \| IMG \| OBJECT \| BR \| SCRIPT \| MAP \| Q \| SUB \| SUP \| SPAN \| BDO">
	# <!ENTITY % formctrl "INPUT \| SELECT \| TEXTAREA \| LABEL \| BUTTON">
	#
	#
	# <!ENTITY % block
	# "P \| %heading; \| %list; \| %preformatted; \| DL \| DIV \| NOSCRIPT \|
	# BLOCKQUOTE \| FORM \| HR \| TABLE \| FIELDSET \| ADDRESS">
	# <!ENTITY % heading "H1\|H2\|H3\|H4\|H5\|H6">
	# <!ENTITY % list "UL \| OL">
	# <!ENTITY % preformatted "PRE">

	my %tagtype = (
	'TT' => 'inline', 'I' => 'inline', 'B' => 'inline', 'BIG' => 'inline',
	'SMALL' => 'inline',

	'EM' => 'inline', 'STRONG' => 'inline', 'DFN' => 'inline',
	'CODE' => 'inline',

	'SAMP' => 'inline', 'KBD' => 'inline', 'VAR' => 'inline',
	'CITE' => 'inline', 'ABBR' => 'inline', 'ACRONYM' => 'inline',

	'A' => 'inline', 'IMG' => 'inline', 'OBJECT' => 'inline', 'BR' => 'inline',
	'SCRIPT' => 'inline', 'MAP' => 'inline', 'Q' => 'inline', 'SUB' => 'inline',
	'SUP' => 'inline', 'SPAN' => 'inline', 'BDO' => 'inline',

	'INPUT' => 'inline', 'SELECT' => 'inline', 'TEXTAREA' => 'inline',
	'LABEL' => 'inline', 'BUTTON' => 'inline',

	'P' => 'block', 'DL' => 'block', 'DIV' => 'block', 'NOSCRIPT' => 'block',

	'BLOCKQUOTE' => 'block', 'FORM' => 'block', 'HR' => 'block',
	'TABLE' => 'block', 'FIELDSET' => 'block', 'ADDRESS' => 'block',

	'H1' => 'block', 'H2' => 'block', 'H3' => 'block', 'H4' => 'block',
	'H5' => 'block', 'H6' => 'block',

	'UL' => 'block', 'OL' => 'block',

	'PRE' => 'block',

	# not defined as block in the dtd but usually breaks lines
	'LI' => 'block', 'DD' => 'block', 'DT' => 'block',
	'IFRAME' => 'block', 'TBODY' => 'block', 'TR' => 'block', 'TD' => 'block',

	# not defined as inline in the dtd but testing shows they behave so
	'OPTION' => 'inline', 'PARAM' => 'inline', 'EMBED' => 'inline',
	'FONT' => 'inline',

	# peculiarity of HTML::Parser??
	'BR/' => 'inline',

	# TODO BASE, CENTER, AREA, TH, LEGEND, INS, THEAD, BLINK, DEL, MARQUEE
	);

	my @files = glob "@ARGV";

	print STDERR join("\n", @files), "\n";

	for (@files) {
	process_file($_);
	}

	print STDERR "----\n\n";

	#dump_unknown_tags();

	exit;

	####################################################################

	sub process_file {
	my $filename = shift;

	print STDERR "stripping $filename...\n";

	our %inside = ();

	# slurp in an HTML file
	open(FH, $filename);
	my @content = <FH>;
	close(FH);

	my $content = join('', @content);

	our $txt = '';

	my $parser = HTML::Parser->new(
	start_h => [\&tag, "tagname, '+1'"],
	end_h => [\&tag, "tagname, '-1'"],
	text_h => [\&text, "dtext"],
	);

	$parser->parse( decode_utf8( $content ) );

	$parser->eof;

	# clean up whitespace etc

	# turn lines with just space to blank lines
	$txt =~ s/^[ \t](.?)[ \t]*$/\1/mg;

	# minify paragraph breaks
	$txt =~ s/\n\n\n+/\n\n/g;

	# minify spaces and tabs
	$txt =~ s/[ \t]+/ /g;

	print $txt;

	dump_unknown_tags();

	return;

	################################################################

	sub tag {
	my($tag, $opt_n) = @_;

	$inside{$tag} += $opt_n;

	my $type = $tagtype{uc $tag};

	# do nothing for inline tags, script and style tags
	if ($type eq 'inline' \|\| $tag eq 'script' \|\| $tag eq 'style') {
	# \n for <br>
	if ($tag eq 'br' \|\| $tag eq 'br/') {
	$txt .= "\n";
	} elsif ($tag eq 'img') {
	$txt .= ' ';
	}
	# \n\n for block tags
	} elsif ($type eq 'block') {
	$txt .= "\n\n";
	# neither inline nor block!
	} else {
	unless ($inside{script} \|\| $inside{style}) {
	unless (grep (/^$tag$/, ('html', 'head', 'title', 'meta', 'link', 'body'))) {
	++ $unktags{$tag} if $opt_n == 1;
	}
	}
	}
	}

	sub text {
	return if $inside{script} \|\| $inside{style};
	return if $inside{head} && !$inside{title};

	my $t = $_[0];

	$t =~ s/\s+/ /g;

	$txt .= $t;
	}
	}

	sub dump_unknown_tags {
	foreach (sort {$unktags{$b} <=> $unktags{$a}} keys %unktags) {
	print STDERR "$_: $unktags{$_}\n";
	}
	}