hoehrmann/identify-quoted-text.pl

## identify-quoted-text.pl
#!perl -w
use Modern::Perl;
use Algorithm::Diff::XS;
use Statistics::Basic qw/median stddev avg/;
use Mail::Mbox::MessageParser;
use MIME::Parser;
use MIME::Parser::Reader;
use Mail::Field;
use Encode;
use HTML::Entities;
use URI::Escape;

binmode STDOUT;

BEGIN {
  # avoid annoying warning
  local $Mail::Mbox::MessageParser::OLDSTDERR = undef;
}

my %mails;
my $mbox_path = $ARGV[0] // die "Usage: $0 example.mbox\n";

open my $mbox, '<', $mbox_path;

my $reader = Mail::Mbox::MessageParser->new({
  file_handle => $mbox,
});

my $mime_parser = MIME::Parser->new;
$mime_parser->output_to_core(1);

while (my $m = $reader->read_next_email) {
  my $entity = $mime_parser->parse_data($m);

  eval {
    my $text = get_text_plain($entity);
    $text = decode_utf8($text);

    $text =~ s/\x0d\x0a/\x0a/sg;

    my $get_mid = sub {
      my $s = shift;
      return $1 if defined $s and $s =~ /(<\S+?>)/;
      "";
    };

    my $head = $entity->head;
    my $mid  = $get_mid->($head->get('Message-Id'));
    my @refs = map { $get_mid->($_) } split /\s+/,
      ($head->get('References') // "");
    my $irt = $get_mid->($head->get('In-Reply-To'));
    my $parent = $irt ? $irt : $refs[-1];

    # ...
    $mails{$mid} = {
      text   => $text,
      entity => $entity,
      parent => $parent
    };

  }

}

sub get_text_plain {

  # Shouldn't there be a module for this?

  my $original = shift;
  my $e        = $original;
  my @plain;

  if ($e->effective_type eq 'text/plain') {
    push @plain, $e;
  } elsif ($e->effective_type eq 'multipart/alternative') {
    push @plain,
      [ reverse grep { $_->effective_type eq 'text/plain' }
        $e->parts ]->[0]
      if $e->parts;
  } elsif ($e->effective_type eq 'multipart/mixed') {
    push @plain,
      grep { $_->effective_type eq 'text/plain' } $e->parts;
  } elsif ($e->effective_type eq 'multipart/signed') {
    push @plain,
      [ reverse grep { $_->effective_type eq 'text/plain' }
        $e->parts ]->[0]
      if $e->parts;
  } else {
    warn "Don't know how to extract plain text from "
      . $e->effective_type
      . " messages";
  }

  return join '', map {
    my $encoded      = $_->bodyhandle->as_string;
    my $content_type = $_->get('Content-Type');
    if (defined $content_type) {
      my $field = Mail::Field->new('Content-Type', $content_type);
      my $charset = $field->charset;
      if (defined $charset) {
        my $decoded = Encode::decode($charset, $encoded, 1);
        $encoded = Encode::encode_utf8($decoded);
      }
    }
    $encoded;
  } @plain;
}

sub make_word_list {
  my $text  = shift;
  my @words = $text =~ /(\w+|\W)/gs;
  my @result;
  my $line = 1;
  my $column = 1;

  for (my $ix = 0; $ix < @words; ++$ix) {
    push @result, {
      word      => $words[$ix],
      index     => $ix,
      word_like => scalar($words[$ix] =~ /^\w+$/),
      line      => $line,
      column    => $column++,
    };
    if ($words[$ix] =~ /[\r\n]/) {
      $line++;
      $column = 1;
    }
  }
  return @result;
}

print q{

<style>
pre:nth-of-type(odd) { background-color: #eee }
i { color: #ccc; font-style: normal }
pre { white-space: pre-wrap }
</style>

};

while (my ($mid, $obj) = each %mails) {

  my @child_words = make_word_list $obj->{text};

  next unless defined $obj->{parent};

  my $parent_obj  = $mails{ $obj->{parent} };

  next unless defined $parent_obj;

  my @parent_words = make_word_list $parent_obj->{text};

  my @child_words_f =
    grep { $_->{word_like} || $_->{word} !~ /[\s>]/ } @child_words;
  my @parent_words_f =
    grep { $_->{word_like} || $_->{word} !~ /[\s>]/ } @parent_words;

  my @diff = Algorithm::Diff::sdiff \@parent_words_f,
    \@child_words_f, sub {
    return $_[0]->{word};
    };

  $_->{unmodified_in_diff} = 0 for @child_words;

  for (@diff) {
    next unless $_->[0] eq 'u';
    $child_words[ $_->[2]->{index} ]->{unmodified_in_diff} = 1;
  }

  $_->{quoted} = $_->{unmodified_in_diff} // 0 for @child_words;

  # If most characters on a line are not quoted, then assume that all
  # the tokens on that line are in fact not quoted. This is to handle
  # false positives that are generated due to overlap of frequent
  # words like "the" and "and" in english text. But do not do that in
  # some edge cases, which are approximated here as not doing it when
  # the line is less than the average line long.

  my %quoted_chars;
  my %unquoted_chars;
  my %chars_per_line;

  $quoted_chars{ $_->{line} } += length $_->{word}
    for grep { $_->{quoted} } @child_words;

  $unquoted_chars{ $_->{line} } += length $_->{word}
    for grep { !$_->{quoted} } @child_words;

  $chars_per_line{ $_->{line} } += length $_->{word}
    for @child_words;

  # ...
  my $median_cpl = 0 + median values %chars_per_line;
  my $average_cpl = 0 + avg values %chars_per_line;
  my $stddev_cpl = 0 + stddev values %chars_per_line;

  for (@child_words) {
    my $uc = $unquoted_chars{ $_->{line} } // 0;
    my $qc = $quoted_chars{ $_->{line} } // 0;
    my $cc = $chars_per_line{ $_->{line} };

    next if $qc >= $uc;
    next if $cc < $average_cpl;

    $_->{quoted} = 0;
  }

  my $mid_s = $1 if $mid =~ /<(.*)>/;

  printf "<pre><a href='http://mid.gmane.org/%s'>news:%s</a>\n\n",
    encode_entities(uri_escape($mid_s)), encode_entities($mid_s);

  for (@child_words) {
    if ($_->{quoted}) {
      print "<i>", encode_entities($_->{word}), "</i>";
    } else {
      print encode_entities($_->{word});
    }
  }
  print "</pre>";

}
	#!perl -w
	use Modern::Perl;
	use Algorithm::Diff::XS;
	use Statistics::Basic qw/median stddev avg/;
	use Mail::Mbox::MessageParser;
	use MIME::Parser;
	use MIME::Parser::Reader;
	use Mail::Field;
	use Encode;
	use HTML::Entities;
	use URI::Escape;

	binmode STDOUT;

	BEGIN {
	# avoid annoying warning
	local $Mail::Mbox::MessageParser::OLDSTDERR = undef;
	}

	my %mails;
	my $mbox_path = $ARGV[0] // die "Usage: $0 example.mbox\n";

	open my $mbox, '<', $mbox_path;

	my $reader = Mail::Mbox::MessageParser->new({
	file_handle => $mbox,
	});

	my $mime_parser = MIME::Parser->new;
	$mime_parser->output_to_core(1);

	while (my $m = $reader->read_next_email) {
	my $entity = $mime_parser->parse_data($m);

	eval {
	my $text = get_text_plain($entity);
	$text = decode_utf8($text);

	$text =~ s/\x0d\x0a/\x0a/sg;

	my $get_mid = sub {
	my $s = shift;
	return $1 if defined $s and $s =~ /(<\S+?>)/;
	"";
	};

	my $head = $entity->head;
	my $mid = $get_mid->($head->get('Message-Id'));
	my @refs = map { $get_mid->($_) } split /\s+/,
	($head->get('References') // "");
	my $irt = $get_mid->($head->get('In-Reply-To'));
	my $parent = $irt ? $irt : $refs[-1];

	# ...
	$mails{$mid} = {
	text => $text,
	entity => $entity,
	parent => $parent
	};

	}

	}

	sub get_text_plain {

	# Shouldn't there be a module for this?

	my $original = shift;
	my $e = $original;
	my @plain;

	if ($e->effective_type eq 'text/plain') {
	push @plain, $e;
	} elsif ($e->effective_type eq 'multipart/alternative') {
	push @plain,
	[ reverse grep { $_->effective_type eq 'text/plain' }
	$e->parts ]->[0]
	if $e->parts;
	} elsif ($e->effective_type eq 'multipart/mixed') {
	push @plain,
	grep { $_->effective_type eq 'text/plain' } $e->parts;
	} elsif ($e->effective_type eq 'multipart/signed') {
	push @plain,
	[ reverse grep { $_->effective_type eq 'text/plain' }
	$e->parts ]->[0]
	if $e->parts;
	} else {
	warn "Don't know how to extract plain text from "
	. $e->effective_type
	. " messages";
	}

	return join '', map {
	my $encoded = $_->bodyhandle->as_string;
	my $content_type = $_->get('Content-Type');
	if (defined $content_type) {
	my $field = Mail::Field->new('Content-Type', $content_type);
	my $charset = $field->charset;
	if (defined $charset) {
	my $decoded = Encode::decode($charset, $encoded, 1);
	$encoded = Encode::encode_utf8($decoded);
	}
	}
	$encoded;
	} @plain;
	}

	sub make_word_list {
	my $text = shift;
	my @words = $text =~ /(\w+\|\W)/gs;
	my @result;
	my $line = 1;
	my $column = 1;

	for (my $ix = 0; $ix < @words; ++$ix) {
	push @result, {
	word => $words[$ix],
	index => $ix,
	word_like => scalar($words[$ix] =~ /^\w+$/),
	line => $line,
	column => $column++,
	};
	if ($words[$ix] =~ /[\r\n]/) {
	$line++;
	$column = 1;
	}
	}
	return @result;
	}

	print q{

	<style>
	pre:nth-of-type(odd) { background-color: #eee }
	i { color: #ccc; font-style: normal }
	pre { white-space: pre-wrap }
	</style>

	};

	while (my ($mid, $obj) = each %mails) {

	my @child_words = make_word_list $obj->{text};

	next unless defined $obj->{parent};

	my $parent_obj = $mails{ $obj->{parent} };

	next unless defined $parent_obj;

	my @parent_words = make_word_list $parent_obj->{text};

	my @child_words_f =
	grep { $_->{word_like} \|\| $_->{word} !~ /[\s>]/ } @child_words;
	my @parent_words_f =
	grep { $_->{word_like} \|\| $_->{word} !~ /[\s>]/ } @parent_words;

	my @diff = Algorithm::Diff::sdiff \@parent_words_f,
	\@child_words_f, sub {
	return $_[0]->{word};
	};

	$_->{unmodified_in_diff} = 0 for @child_words;

	for (@diff) {
	next unless $_->[0] eq 'u';
	$child_words[ $_->[2]->{index} ]->{unmodified_in_diff} = 1;
	}

	$_->{quoted} = $_->{unmodified_in_diff} // 0 for @child_words;

	# If most characters on a line are not quoted, then assume that all
	# the tokens on that line are in fact not quoted. This is to handle
	# false positives that are generated due to overlap of frequent
	# words like "the" and "and" in english text. But do not do that in
	# some edge cases, which are approximated here as not doing it when
	# the line is less than the average line long.

	my %quoted_chars;
	my %unquoted_chars;
	my %chars_per_line;

	$quoted_chars{ $_->{line} } += length $_->{word}
	for grep { $_->{quoted} } @child_words;

	$unquoted_chars{ $_->{line} } += length $_->{word}
	for grep { !$_->{quoted} } @child_words;

	$chars_per_line{ $_->{line} } += length $_->{word}
	for @child_words;

	# ...
	my $median_cpl = 0 + median values %chars_per_line;
	my $average_cpl = 0 + avg values %chars_per_line;
	my $stddev_cpl = 0 + stddev values %chars_per_line;

	for (@child_words) {
	my $uc = $unquoted_chars{ $_->{line} } // 0;
	my $qc = $quoted_chars{ $_->{line} } // 0;
	my $cc = $chars_per_line{ $_->{line} };

	next if $qc >= $uc;
	next if $cc < $average_cpl;

	$_->{quoted} = 0;
	}

	my $mid_s = $1 if $mid =~ /<(.*)>/;

	printf "<pre><a href='http://mid.gmane.org/%s'>news:%s</a>\n\n",
	encode_entities(uri_escape($mid_s)), encode_entities($mid_s);

	for (@child_words) {
	if ($_->{quoted}) {
	print "<i>", encode_entities($_->{word}), "</i>";
	} else {
	print encode_entities($_->{word});
	}
	}
	print "</pre>";

	}