Skip to content

Instantly share code, notes, and snippets.

@tvwerkhoven
Created April 5, 2010 20:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tvwerkhoven/356799 to your computer and use it in GitHub Desktop.
Save tvwerkhoven/356799 to your computer and use it in GitHub Desktop.
Find the word distance for each word, and signal words that are close together
#!/usr/bin/perl -w
#
# Find the word distance for each word, and signal words that are close
# together. Can be useful for reviewing written text (papers, essays,
# whatever) to see if you might need a thesaurus somewhere.
#
# Tim van Werkhoven, 20090426 <t.i.m.vanwerkhoven@xs4all.nl>
# This file is licensed under the Creative Commons Attribution-Share Alike
# license versions 3.0 or higher, see
# http://creativecommons.org/licenses/by-sa/3.0/
use strict;
use warnings;

# --- Configuration ---------------------------------------------------------
# If the inter-word distance is less than this, signal a warning
my $LIMIT = 10;
# Is the above count in characters (=0) or words (=1)?
my $MODE = 1;
# What is the minimum length of words to check for?
my $MINLEN = 4;
# What is a comment?
my $COMMCHAR = "%";

# --- Input -----------------------------------------------------------------
# The file to check is the first command-line argument. Fail loudly when it
# is missing or cannot be read, instead of silently producing no output.
die "Usage: $0 <file>\n" unless @ARGV;
# Three-argument open keeps special characters in the filename from being
# interpreted as part of the open mode. The bareword handle name is kept
# because the read loop further down refers to FILE.
open(FILE, '<', $ARGV[0]) or die "Cannot open '$ARGV[0]': $!\n";

# --- State -----------------------------------------------------------------
# keep track of the line number we're looking at
my $line = 0;
# words read from the file that have not been checked yet, oldest first
my @history;
# the word currently being checked
my $word = "";
# one [line, word, distance] record per hit, and the number of hits so far
my @output;
my $hits = 0;
# index into @history used while looking ahead for a repeated word
my $i;
#while (file) {
# get enough data into history
# start comparing word by word
# eject words if history becomes too long
#}
# Line number of every word in @history, kept in step with it, so that a hit
# can be reported on the line the word actually appeared on.
my @history_lines;

# Lowercase a word and strip every non-alphanumeric character (underscores
# included), so that e.g. "Word," and "word" compare as equal.
sub normalize_word {
    my ($raw) = @_;
    my $norm = lc $raw;
    $norm =~ s/[\W_]//g;
    return $norm;
}

# Search the words following $word for a repeat of it.
#   $word      - the word to look for, as it appeared in the text
#   $following - array ref with the words that come after it, in order
#   $limit     - inspect at most this many of the following words
#   $minlen    - words whose normalized form is shorter than this are ignored
# Returns the zero-based index into @$following of the first repeat, or
# nothing (undef in scalar context) when there is none.
sub find_repeat {
    my ($word, $following, $limit, $minlen) = @_;
    my $needle = normalize_word($word);
    return if length($needle) < $minlen;

    # Bound the scan by the number of available words rather than by the
    # truthiness of each word, so that a word like "0" cannot end it early.
    my $last = $limit - 1;
    $last = $#{$following} if $#{$following} < $last;
    for my $dist (0 .. $last) {
        return $dist if normalize_word($following->[$dist]) eq $needle;
    }
    return;
}

# Take the oldest word off the history, look for a repeat of it among the
# next $LIMIT words and, if there is one, record the hit in @output/$hits and
# print the offending stretch of text.
sub check_oldest_word {
    my $word      = shift @history;
    my $word_line = shift @history_lines;
    my $dist      = find_repeat($word, \@history, $LIMIT, $MINLEN);
    return if !defined $dist;

    push @output, [ $word_line, $word, $dist ];
    $hits++;
    print "FOUND: $word (d: $dist), line $word_line: ";
    print join(' ', $word, @history[0 .. $dist]), "\n";
    return;
}

while (my $text = <FILE>) {
    $line++;
    chomp $text;

    # Skip comment lines; \Q..\E makes the comment marker match literally
    # even if it is a regex metacharacter.
    next if $text =~ /^\Q$COMMCHAR\E/;

    # split ' ' splits on any run of whitespace and drops leading whitespace,
    # so double spaces and tabs do not put empty "words" into the history.
    my @words = split ' ', $text;
    push @history, @words;
    push @history_lines, ($line) x @words;

    # SEARCH WITH WORDLENGTH
    if ($MODE == 1) {
        # Only check a word once the full look-ahead window of $LIMIT words
        # behind it has been read.
        check_oldest_word() while @history > $LIMIT;
    }
    # SEARCH WITH CHARACTER LENGTH
    elsif ($MODE == 0) {
        print "This does not work yet :)";
    }
}

# File reached EOF, now parse the remaining history. The very last word has
# nothing after it, so it does not need checking.
check_oldest_word() while @history > 1;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment