Created
July 11, 2016 21:38
-
-
Save lexszero/ddc9129c48102ba70d0e3cd3937a2340 to your computer and use it in GitHub Desktop.
hatebot_parser.pl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl
# Reads text from STDIN line by line and records, for every pair of
# consecutive words, the word that followed them.  The result is merged into
# the Storable file 'dict' (created if missing) so repeated runs accumulate.
#
# Layout of the stored structure:
#   $data{$first}{$second} = [ every word seen right after "$first $second" ]
use strict;
use warnings;
use Storable;
use utf8;

# The 'encoding' pragma is deprecated and no longer supported by modern Perl,
# so the UTF-8 layers it used to install on the standard streams are set up
# explicitly here.
binmode STDIN,  ':encoding(UTF-8)';
binmode STDOUT, ':encoding(UTF-8)';

my $dict_file = 'dict';

# Start from the previously saved dictionary when there is one.
my %data;
if (-e $dict_file) {
    my $loaded = retrieve($dict_file);
    # retrieve() returns undef on I/O errors; refuse to carry on (and later
    # overwrite the file) with anything that is not the expected hash.
    die "Cannot load dictionary '$dict_file': $!\n"
        unless defined $loaded;
    die "Dictionary '$dict_file' does not contain a hash\n"
        unless ref($loaded) eq 'HASH';
    %data = %{$loaded};
}
print "Dict loaded\n";

while (<STDIN>) {
    # clean_string() is handed the line explicitly; $_ is also the current
    # line at this point.
    my @words = split / /, clean_string($_);

    # Slide a three-word window over the line.  Lines with fewer than three
    # words produce an empty range and contribute nothing.
    for my $i (2 .. $#words) {
        push @{ $data{ $words[$i - 2] }{ $words[$i - 1] } }, $words[$i];
    }
}

# store() returns undef when the file could not be written.
store(\%data, $dict_file)
    or die "Cannot save dictionary '$dict_file': $!\n";
# clean_string($text)
#
# Normalises one line of input into a single-spaced sentence.
#
#   $text - the string to clean; when called with no arguments the current
#           value of $_ is used instead.
#
# Returns the cleaned copy: single and double quotes removed, every run of
# whitespace (including the line ending) collapsed to one space, leading and
# trailing whitespace dropped, and a '.' appended unless the text already ends
# in '.', '!' or '?'.  An empty or whitespace-only input yields ''.
# Neither the argument nor the caller's $_ is modified.
sub clean_string {
    my ($text) = @_ ? @_ : ($_);
    return '' unless defined $text;

    # Quotes go first: removing them after the whitespace pass could leave
    # two adjacent spaces, which a later split on ' ' turns into empty words.
    $text =~ s/["']//g;

    # The line ending is ordinary whitespace here, so it is collapsed and
    # trimmed instead of being turned into an unconditional '.'.
    $text =~ s/\s+/ /g;
    $text =~ s/^\s+|\s+$//g;

    # Terminate the sentence only when it is not already terminated.
    $text =~ s/([^.!?])\z/$1./;

    return $text;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment