Skip to content

Instantly share code, notes, and snippets.

@hippietrail
Created December 22, 2010 18:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hippietrail/751910 to your computer and use it in GitHub Desktop.
Save hippietrail/751910 to your computer and use it in GitHub Desktop.
Strip HTML but retain block/inline structure
#!/usr/bin/perl
# TODO img alt text?
# TODO turn into a module so randomwikipage.pl can use it
use strict;
use Encode 'decode_utf8';
use File::DosGlob;
use HTML::Parser;
binmode(STDOUT, ':utf8');
my %unktags; # unknown tags
# from HTML 4.0 DTD
#
# <!-- %inline; covers inline or "text-level" elements -->
# <!ENTITY % inline "#PCDATA | %fontstyle; | %phrase; | %special; | %formctrl;">
# <!ENTITY % fontstyle
# "TT | I | B | BIG | SMALL">
# <!ENTITY % phrase "EM | STRONG | DFN | CODE |
# SAMP | KBD | VAR | CITE | ABBR | ACRONYM" >
# <!ENTITY % special
# "A | IMG | OBJECT | BR | SCRIPT | MAP | Q | SUB | SUP | SPAN | BDO">
# <!ENTITY % formctrl "INPUT | SELECT | TEXTAREA | LABEL | BUTTON">
#
#
# <!ENTITY % block
# "P | %heading; | %list; | %preformatted; | DL | DIV | NOSCRIPT |
# BLOCKQUOTE | FORM | HR | TABLE | FIELDSET | ADDRESS">
# <!ENTITY % heading "H1|H2|H3|H4|H5|H6">
# <!ENTITY % list "UL | OL">
# <!ENTITY % preformatted "PRE">
my %tagtype = (
'TT' => 'inline', 'I' => 'inline', 'B' => 'inline', 'BIG' => 'inline',
'SMALL' => 'inline',
'EM' => 'inline', 'STRONG' => 'inline', 'DFN' => 'inline',
'CODE' => 'inline',
'SAMP' => 'inline', 'KBD' => 'inline', 'VAR' => 'inline',
'CITE' => 'inline', 'ABBR' => 'inline', 'ACRONYM' => 'inline',
'A' => 'inline', 'IMG' => 'inline', 'OBJECT' => 'inline', 'BR' => 'inline',
'SCRIPT' => 'inline', 'MAP' => 'inline', 'Q' => 'inline', 'SUB' => 'inline',
'SUP' => 'inline', 'SPAN' => 'inline', 'BDO' => 'inline',
'INPUT' => 'inline', 'SELECT' => 'inline', 'TEXTAREA' => 'inline',
'LABEL' => 'inline', 'BUTTON' => 'inline',
'P' => 'block', 'DL' => 'block', 'DIV' => 'block', 'NOSCRIPT' => 'block',
'BLOCKQUOTE' => 'block', 'FORM' => 'block', 'HR' => 'block',
'TABLE' => 'block', 'FIELDSET' => 'block', 'ADDRESS' => 'block',
'H1' => 'block', 'H2' => 'block', 'H3' => 'block', 'H4' => 'block',
'H5' => 'block', 'H6' => 'block',
'UL' => 'block', 'OL' => 'block',
'PRE' => 'block',
# not defined as block in the dtd but usually breaks lines
'LI' => 'block', 'DD' => 'block', 'DT' => 'block',
'IFRAME' => 'block', 'TBODY' => 'block', 'TR' => 'block', 'TD' => 'block',
# not defined as inline in the dtd but testing shows they behave so
'OPTION' => 'inline', 'PARAM' => 'inline', 'EMBED' => 'inline',
'FONT' => 'inline',
# peculiarity of HTML::Parser??
'BR/' => 'inline',
# TODO BASE, CENTER, AREA, TH, LEGEND, INS, THEAD, BLINK, DEL, MARQUEE
);
my @files = glob "@ARGV";
print STDERR join("\n", @files), "\n";
for (@files) {
process_file($_);
}
print STDERR "----\n\n";
#dump_unknown_tags();
exit;
####################################################################
sub process_file {
my $filename = shift;
print STDERR "stripping $filename...\n";
our %inside = ();
# slurp in an HTML file
open(FH, $filename);
my @content = <FH>;
close(FH);
my $content = join('', @content);
our $txt = '';
my $parser = HTML::Parser->new(
start_h => [\&tag, "tagname, '+1'"],
end_h => [\&tag, "tagname, '-1'"],
text_h => [\&text, "dtext"],
);
$parser->parse( decode_utf8( $content ) );
$parser->eof;
# clean up whitespace etc
# turn lines with just space to blank lines
$txt =~ s/^[ \t]*(.*?)[ \t]*$/\1/mg;
# minify paragraph breaks
$txt =~ s/\n\n\n+/\n\n/g;
# minify spaces and tabs
$txt =~ s/[ \t]+/ /g;
print $txt;
dump_unknown_tags();
return;
################################################################
sub tag {
my($tag, $opt_n) = @_;
$inside{$tag} += $opt_n;
my $type = $tagtype{uc $tag};
# do nothing for inline tags, script and style tags
if ($type eq 'inline' || $tag eq 'script' || $tag eq 'style') {
# \n for <br>
if ($tag eq 'br' || $tag eq 'br/') {
$txt .= "\n";
} elsif ($tag eq 'img') {
$txt .= ' ';
}
# \n\n for block tags
} elsif ($type eq 'block') {
$txt .= "\n\n";
# neither inline nor block!
} else {
unless ($inside{script} || $inside{style}) {
unless (grep (/^$tag$/, ('html', 'head', 'title', 'meta', 'link', 'body'))) {
++ $unktags{$tag} if $opt_n == 1;
}
}
}
}
sub text {
return if $inside{script} || $inside{style};
return if $inside{head} && !$inside{title};
my $t = $_[0];
$t =~ s/\s+/ /g;
$txt .= $t;
}
}
sub dump_unknown_tags {
foreach (sort {$unktags{$b} <=> $unktags{$a}} keys %unktags) {
print STDERR "$_: $unktags{$_}\n";
}
}
@hippietrail
Copy link
Author

This is my answer (so far) to my own question on stackoverflow:

http://stackoverflow.com/questions/4396497/stripping-html-but-retaining-block-inline-structure

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment