Skip to content

Instantly share code, notes, and snippets.

@sn1p3r46
Last active June 26, 2016 18:11
Show Gist options
  • Save sn1p3r46/1b839da1ce8065ef85b6d8dd2fc324d2 to your computer and use it in GitHub Desktop.
Save sn1p3r46/1b839da1ce8065ef85b6d8dd2fc324d2 to your computer and use it in GitHub Desktop.
TEXT PREPROCESSING WITH PERL
#!/usr/bin/perl
#
# Text preprocessing for a per-language corpus.
#
# For every language code in @myLangs the script:
#   1. loads that language's stop-word list from  $stp<lang>$stf,
#   2. lists the files found in                   $dp<lang>$sf,
#   3. writes a cleaned copy of each file to      $dp<lang>$ef.
#
# Cleaning a line means: removing tabs, punctuation and stop words
# (case-insensitively), collapsing runs of spaces into one space, dropping
# lines that end up blank, and lower-casing what is left.
#
# NOTE(review): all files are read and written as raw bytes (no :encoding
# layer). If the corpus is UTF-8, \b, /i and lc will not treat accented
# letters as word characters -- confirm the encoding of the texts and of the
# stop-word lists before adding an explicit layer.
use strict;
use warnings;

my @myLangs = qw( de it fr );
# STOP WORDS FOLDER
my $stp = "/home/andrea/Downloads/DataMining/code/data/";
# STOP WORDS FILE
my $stf = "/stopwords.txt";
# DATA MAIN PATH
my $dp = "/home/andrea/Downloads/DataMining/code/dwld/Gutenberg/dwdl/stripped/";
# SOURCE SUB-DIR
my $sf = "/noHeaders/";
# DEST SUB-DIR
my $ef = "/ready/";

for my $lang (@myLangs) {
    # LOAD THE STOP WORDS FOR THIS LANGUAGE
    my $stop_file  = $stp . $lang . $stf;
    my @stop_words = ();
    if (open my $stop_fh, '<', $stop_file) {
        @stop_words = <$stop_fh>;
        close $stop_fh;
    }
    else {
        # Best effort, as before: keep going without stop-word removal, but
        # report the problem on STDERR instead of mixing it into STDOUT.
        warn "Can't read stop words file $stop_file: $!\n";
    }

    # Trim surrounding whitespace (chomp alone leaves the \r of CRLF files,
    # which would stop every word from matching) and drop blank entries.
    s/\A\s+|\s+\z//g for @stop_words;
    @stop_words = grep { length } @stop_words;

    # Build the "things to delete" pattern once per language. Stop words are
    # escaped so that a '.', '(' or similar in the list is matched literally.
    my @alternatives = ('\t', '[[:punct:]]');
    if (@stop_words) {
        push @alternatives,
            '\b(?:' . join('|', map { quotemeta } @stop_words) . ')\b';
    }
    my $pattern  = join '|', @alternatives;
    my $strip_re = qr/$pattern/i;

    # GET ALL FILE NAMES FOR THIS LANGUAGE
    my $src_dir = $dp . $lang . $sf;
    my $dst_dir = $dp . $lang . $ef;
    opendir my $dir, $src_dir
        or die "Cannot open directory:" . $src_dir . " $!";
    my @files = readdir $dir;
    closedir $dir;

    for my $name (@files) {
        next if $name =~ /^\./;          # skip '.', '..' and hidden entries

        my $src = $src_dir . $name;
        my $dst = $dst_dir . $name;
        next unless -f $src;             # readdir also returns sub-directories

        print $src, "\n";
        print $dst, "\n";

        open my $in, '<', $src
            or die " could not open file: " . $src . " $! \n";
        open my $out, '>', $dst
            or die " could not open file.. " . $dst . " $! \n";

        # FILE PROCESSING
        while (my $line = <$in>) {
            # REMOVE TABS, PUNCTUATION AND STOP WORDS
            $line =~ s/$strip_re//g;
            # REDUCE REDUNDANT SPACES
            $line =~ s/ +/ /g;
            # IGNORE BLANK LINES AND WRITE IN LOWERCASE TO FILE
            next unless $line =~ /\S/;
            print {$out} lc $line;
        }

        close $in;
        # Buffered write errors (e.g. disk full) only surface at close time.
        close $out
            or die " could not close file: " . $dst . " $! \n";
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment