Skip to content

Instantly share code, notes, and snippets.

@ag4ve
Created January 4, 2016 23:07
Show Gist options
  • Save ag4ve/cef23a2c48c38950dcc2 to your computer and use it in GitHub Desktop.
Save ag4ve/cef23a2c48c38950dcc2 to your computer and use it in GitHub Desktop.
#!/usr/bin/env perl
use strict;
use warnings;
use Data::Dumper;
use Digest::MD5 qw(md5_base64);
use Text::CSV;
# Pseudonymise a CSV export.
#
# Usage: SCRIPT INPUT.csv [LOOKUP.csv] [SCORE.csv]
#
# How the input columns are used by this script:
#   column 0     - copied through unchanged
#   columns 1-4  - name / e-mail fields; their words are scrubbed from the
#                  content columns and the fields themselves go only to the
#                  lookup file
#   columns 5..  - free-text content
# The first row is treated as a header.
#
# Outputs:
#   LOOKUP.csv (default lookup.csv): Hash, columns 1-4
#   SCORE.csv  (default out.csv):    Score (blank), Hash, column 0, content

my $csv = Text::CSV->new({binary => 1, always_quote => 1})
    or die "Can not use CSV: " . Text::CSV->error_diag();

my $filein = $ARGV[0];
defined $filein
    or die "Usage: $0 INPUT.csv [LOOKUP.csv] [SCORE.csv]\n";
my $lookupout = $ARGV[1] // 'lookup.csv';
my $scoreout  = $ARGV[2] // 'out.csv';

open(my $fh, '<:encoding(utf8)', $filein)
    or die "Can not open [$filein] $!";
# Fall back to an empty list so the loops below never dereference undef.
my $rows = $csv->getline_all($fh) // [];
# In void context error_diag() reports the parse problem on STDERR.
$csv->eof or $csv->error_diag();
close $fh;

# Return the distinct words found in the given identity fields, each one
# already escaped for literal use inside a regex.
sub _identity_words {
    my @fields = @_;

    my %seen;
    my @words;
    for my $field (grep { defined } @fields) {    # short rows yield undef
        # Fold every newline/tab into a space and drop every angle bracket
        # so that "Name <user@host>" splits cleanly on ' ' and '@'.
        (my $flat = $field) =~ tr/\n\r\t/ /;
        $flat =~ tr/<>//d;

        for my $word (split /[ @]/, $flat) {
            # Minimum length is judged on the raw word, before escaping.
            next if length($word) <= 2;
            # Matching is case-insensitive, so dedupe the same way.
            next if $seen{lc $word}++;
            # Escape *all* metacharacters, not just '.' and '+'.
            push @words, quotemeta $word;
        }
    }
    return @words;
}

# Replace each run of identity words (each followed by a single space) in
# the given cells with "<replaced> " and return the new cells.
# Only occurrences followed by a space are matched, and there is no leading
# word boundary, so a word can also match as the tail of a longer token.
sub _redact {
    my ($words, @cells) = @_;

    # An empty alternation would match at every space and flood the text
    # with "<replaced> ", so leave the cells alone when there are no words.
    return @cells if !@{$words};

    my $alternation = join '|', @{$words};
    my $run_re      = qr/(?:(?:$alternation) )+/i;

    return map {
        my $cell = $_;
        defined($cell) ? ($cell =~ s/$run_re/<replaced> /gr) : $cell;
    } @cells;
}

my $lookup = [];
my $score  = [];
for my $i (0 .. $#{$rows}) {
    my $row      = $rows->[$i];
    my @identity = @{$row}[1 .. 4];
    my @content  = @{$row}[5 .. $#{$row}];

    # Header row: label the two new columns, no scrubbing.
    if ($i == 0) {
        push @{$lookup}, ['Hash', @identity];
        push @{$score},  ['Score', 'Hash', $row->[0], @content];
        next;
    }

    # Random token that ties a scored row back to its lookup entry; it is
    # not derived from the row's data.
    my $hash = md5_base64(rand());
    @content = _redact([ _identity_words(@identity) ], @content);

    push @{$lookup}, [$hash, @identity];
    push @{$score},  ['', $hash, $row->[0], @content];
}

$csv->eol("\r\n");
for my $out ([$lookupout, $lookup], [$scoreout, $score]) {
    my ($path, $records) = @{$out};

    open(my $out_fh, '>:encoding(utf8)', $path)
        or die "Can not write [" . $path . "] $!";
    for my $record (@{$records}) {
        $csv->print($out_fh, $record)
            or die "Can not write row to [$path]: " . $csv->error_diag();
    }
    # Buffered write errors only surface at close.
    close $out_fh
        or die "Can not close [$path] $!";
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment