Skip to content

Instantly share code, notes, and snippets.

@hoehrmann
Created February 28, 2013 01:24
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hoehrmann/5053420 to your computer and use it in GitHub Desktop.
Save hoehrmann/5053420 to your computer and use it in GitHub Desktop.
Take mbox, print HTML version of contents with quoted text marked via <i> elements.
#!perl -w
use Modern::Perl;
use Algorithm::Diff::XS;
use Statistics::Basic qw/median stddev avg/;
use Mail::Mbox::MessageParser;
use MIME::Parser;
use MIME::Parser::Reader;
use Mail::Field;
use Encode;
use HTML::Entities;
use URI::Escape;
binmode STDOUT;
BEGIN {
# avoid annoying warning
local $Mail::Mbox::MessageParser::OLDSTDERR = undef;
}
my %mails;
my $mbox_path = $ARGV[0] // die "Usage: $0 example.mbox\n";
open my $mbox, '<', $mbox_path;
my $reader = Mail::Mbox::MessageParser->new({
file_handle => $mbox,
});
my $mime_parser = MIME::Parser->new;
$mime_parser->output_to_core(1);
while (my $m = $reader->read_next_email) {
my $entity = $mime_parser->parse_data($m);
eval {
my $text = get_text_plain($entity);
$text = decode_utf8($text);
$text =~ s/\x0d\x0a/\x0a/sg;
my $get_mid = sub {
my $s = shift;
return $1 if defined $s and $s =~ /(<\S+?>)/;
"";
};
my $head = $entity->head;
my $mid = $get_mid->($head->get('Message-Id'));
my @refs = map { $get_mid->($_) } split /\s+/,
($head->get('References') // "");
my $irt = $get_mid->($head->get('In-Reply-To'));
my $parent = $irt ? $irt : $refs[-1];
# ...
$mails{$mid} = {
text => $text,
entity => $entity,
parent => $parent
};
}
}
sub get_text_plain {
# Shouldn't there be a module for this?
my $original = shift;
my $e = $original;
my @plain;
if ($e->effective_type eq 'text/plain') {
push @plain, $e;
} elsif ($e->effective_type eq 'multipart/alternative') {
push @plain,
[ reverse grep { $_->effective_type eq 'text/plain' }
$e->parts ]->[0]
if $e->parts;
} elsif ($e->effective_type eq 'multipart/mixed') {
push @plain,
grep { $_->effective_type eq 'text/plain' } $e->parts;
} elsif ($e->effective_type eq 'multipart/signed') {
push @plain,
[ reverse grep { $_->effective_type eq 'text/plain' }
$e->parts ]->[0]
if $e->parts;
} else {
warn "Don't know how to extract plain text from "
. $e->effective_type
. " messages";
}
return join '', map {
my $encoded = $_->bodyhandle->as_string;
my $content_type = $_->get('Content-Type');
if (defined $content_type) {
my $field = Mail::Field->new('Content-Type', $content_type);
my $charset = $field->charset;
if (defined $charset) {
my $decoded = Encode::decode($charset, $encoded, 1);
$encoded = Encode::encode_utf8($decoded);
}
}
$encoded;
} @plain;
}
sub make_word_list {
my $text = shift;
my @words = $text =~ /(\w+|\W)/gs;
my @result;
my $line = 1;
my $column = 1;
for (my $ix = 0; $ix < @words; ++$ix) {
push @result, {
word => $words[$ix],
index => $ix,
word_like => scalar($words[$ix] =~ /^\w+$/),
line => $line,
column => $column++,
};
if ($words[$ix] =~ /[\r\n]/) {
$line++;
$column = 1;
}
}
return @result;
}
print q{
<style>
pre:nth-of-type(odd) { background-color: #eee }
i { color: #ccc; font-style: normal }
pre { white-space: pre-wrap }
</style>
};
while (my ($mid, $obj) = each %mails) {
my @child_words = make_word_list $obj->{text};
next unless defined $obj->{parent};
my $parent_obj = $mails{ $obj->{parent} };
next unless defined $parent_obj;
my @parent_words = make_word_list $parent_obj->{text};
my @child_words_f =
grep { $_->{word_like} || $_->{word} !~ /[\s>]/ } @child_words;
my @parent_words_f =
grep { $_->{word_like} || $_->{word} !~ /[\s>]/ } @parent_words;
my @diff = Algorithm::Diff::sdiff \@parent_words_f,
\@child_words_f, sub {
return $_[0]->{word};
};
$_->{unmodified_in_diff} = 0 for @child_words;
for (@diff) {
next unless $_->[0] eq 'u';
$child_words[ $_->[2]->{index} ]->{unmodified_in_diff} = 1;
}
$_->{quoted} = $_->{unmodified_in_diff} // 0 for @child_words;
# If most characters on a line are not quoted, then assume that all
# the tokens on that line are in fact not quoted. This is to handle
# false positives that are generated due to overlap of frequent
# words like "the" and "and" in english text. But do not do that in
# some edge cases, which are approximated here as not doing it when
# the line is less than the average line long.
my %quoted_chars;
my %unquoted_chars;
my %chars_per_line;
$quoted_chars{ $_->{line} } += length $_->{word}
for grep { $_->{quoted} } @child_words;
$unquoted_chars{ $_->{line} } += length $_->{word}
for grep { !$_->{quoted} } @child_words;
$chars_per_line{ $_->{line} } += length $_->{word}
for @child_words;
# ...
my $median_cpl = 0 + median values %chars_per_line;
my $average_cpl = 0 + avg values %chars_per_line;
my $stddev_cpl = 0 + stddev values %chars_per_line;
for (@child_words) {
my $uc = $unquoted_chars{ $_->{line} } // 0;
my $qc = $quoted_chars{ $_->{line} } // 0;
my $cc = $chars_per_line{ $_->{line} };
next if $qc >= $uc;
next if $cc < $average_cpl;
$_->{quoted} = 0;
}
my $mid_s = $1 if $mid =~ /<(.*)>/;
printf "<pre><a href='http://mid.gmane.org/%s'>news:%s</a>\n\n",
encode_entities(uri_escape($mid_s)), encode_entities($mid_s);
for (@child_words) {
if ($_->{quoted}) {
print "<i>", encode_entities($_->{word}), "</i>";
} else {
print encode_entities($_->{word});
}
}
print "</pre>";
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment