tvwerkhoven/worddistance.pl

## worddistance.pl
#!/usr/bin/perl -w
#
# Find the word distance for each word, and signal words that are close
# together. Can be useful for reviewing written text (papers, essays,
# whatever) to see if you might need a thesaurus somewhere.
#
# Tim van Werkhoven, 20090426 <t.i.m.vanwerkhoven@xs4all.nl>
# This file is licensed under the Creative Commons Attribution-Share Alike
# license versions 3.0 or higher, see
# http://creativecommons.org/licenses/by-sa/3.0/

# If the inter-word distance is less than this, signal a warning
my $LIMIT = 10;
# Is the above count in characters (=0) or words (=1)?
# Only word mode (=1) is implemented.
my $MODE = 1;
# What is the minimum length of words to check for?
my $MINLEN = 4;
# Lines starting with this string are treated as comments and skipped
my $COMMCHAR = "%";


# keep track of the line number we're looking at
my $line=0;
# sliding window of words that were read but not checked yet, oldest first
my @history;
# the word currently being checked, exactly as it appeared in the input
my $word="";

# every hit found so far, stored as [line number, word, distance]
my @output;
my $hits=0;

# Normalise a word for comparison: lower-case it and strip every character
# that is not a letter or a digit (punctuation and underscores).
my $normalize = sub {
	my ($raw) = @_;
	(my $clean = lc $raw) =~ s/[\W_]//g;
	return $clean;
};

# Remove the oldest word from @history and look for a repeat of it among the
# (at most) $LIMIT words that follow. The first repeat found is recorded in
# @output and printed together with the words in between. The distance that
# is reported is the index of the repeat in the remaining history, so 0 means
# the two words are adjacent.
my $check_oldest_word = sub {
	$word = shift @history;
	my $needle = $normalize->($word);

	# Short words are never reported, so there is no point in scanning
	return if length($needle) < $MINLEN;

	# Scan by index, not by truthiness of the element: a word such as "0"
	# must not end the scan early.
	my $last = $LIMIT - 1;
	$last = $#history if $#history < $last;

	for my $i (0 .. $last) {
		next if $needle ne $normalize->($history[$i]);

		push @output, [$line, $word, $i];
		$hits++;
		# $line is the line read most recently, which can be later than the
		# line on which $word itself occurred.
		print "FOUND: $word (d: $i), line $line: ";
		print join(' ', $word, @history[0 .. $i]), "\n";
		return;
	}
	return;
};

# Open the file
@ARGV or die "Usage: $0 <file>\n";
open(my $fh, '<', $ARGV[0]) or die "Cannot open '$ARGV[0]': $!\n";

while (my $text = <$fh>) {
	$line++;
	chomp $text;

	# Check for comments
	next if $text =~ /^\Q$COMMCHAR\E/;

	# Append the words of this line to the history. Splitting on ' ' breaks
	# on any run of whitespace (tabs included) and never yields empty
	# fields, so only real words count towards the distance.
	push @history, split(' ', $text);

	# SEARCH WITH WORDLENGTH
	if ($MODE == 1) {
		# Keep $LIMIT words of look-ahead; check everything older than that
		$check_oldest_word->() while @history > $LIMIT;
	}
	# SEARCH WITH CHARACTER LENGTH
	elsif ($MODE == 0) {
		print "This does not work yet :)";
	}
}
close($fh);

# File reached EOF, now check the words that are still in the history
$check_oldest_word->() while @history > 1;
	#!/usr/bin/perl -w
	#
	# Find the word distance for each word, and signal words that are close
	# together. Can be useful for reviewing written text (papers, essays,
	# whatever) to see if you might need a thesaurus somewhere.
	#
	# Tim van Werkhoven, 20090426 <t.i.m.vanwerkhoven@xs4all.nl>
	# This file is licensed under the Creative Commons Attribution-Share Alike
	# license versions 3.0 or higher, see
	# http://creativecommons.org/licenses/by-sa/3.0/

	# If the inter-word distance is less than this, signal a warning
	my $LIMIT = 10;
	# Is the above count in characters (=0) or words (=1)?
	# Only word mode (=1) is implemented.
	my $MODE = 1;
	# What is the minimum length of words to check for?
	my $MINLEN = 4;
	# Lines starting with this string are treated as comments and skipped
	my $COMMCHAR = "%";


	# keep track of the line number we're looking at
	my $line=0;
	# sliding window of words that were read but not checked yet, oldest first
	my @history;
	# the word currently being checked, exactly as it appeared in the input
	my $word="";

	# every hit found so far, stored as [line number, word, distance]
	my @output;
	my $hits=0;

	# Normalise a word for comparison: lower-case it and strip every character
	# that is not a letter or a digit. The character class [\W_] removes
	# punctuation and underscores one character at a time, so "Word," and
	# "word" compare equal.
	my $normalize = sub {
		my ($raw) = @_;
		(my $clean = lc $raw) =~ s/[\W_]//g;
		return $clean;
	};

	# Remove the oldest word from @history and look for a repeat of it among
	# the (at most) $LIMIT words that follow. The first repeat found is
	# recorded in @output and printed together with the words in between. The
	# distance that is reported is the index of the repeat in the remaining
	# history, so 0 means the two words are adjacent.
	my $check_oldest_word = sub {
		$word = shift @history;
		my $needle = $normalize->($word);

		# Short words are never reported, so there is no point in scanning
		return if length($needle) < $MINLEN;

		# Scan by index, not by truthiness of the element: a word such as
		# "0" must not end the scan early.
		my $last = $LIMIT - 1;
		$last = $#history if $#history < $last;

		for my $i (0 .. $last) {
			next if $needle ne $normalize->($history[$i]);

			push @output, [$line, $word, $i];
			$hits++;
			# $line is the line read most recently, which can be later than
			# the line on which $word itself occurred.
			print "FOUND: $word (d: $i), line $line: ";
			print join(' ', $word, @history[0 .. $i]), "\n";
			return;
		}
		return;
	};

	# Open the file
	@ARGV or die "Usage: $0 <file>\n";
	open(my $fh, '<', $ARGV[0]) or die "Cannot open '$ARGV[0]': $!\n";

	while (my $text = <$fh>) {
		$line++;
		chomp $text;

		# Check for comments
		next if $text =~ /^\Q$COMMCHAR\E/;

		# Append the words of this line to the history. Splitting on ' '
		# breaks on any run of whitespace (tabs included) and never yields
		# empty fields, so only real words count towards the distance.
		push @history, split(' ', $text);

		# SEARCH WITH WORDLENGTH
		if ($MODE == 1) {
			# Keep $LIMIT words of look-ahead; check everything older
			$check_oldest_word->() while @history > $LIMIT;
		}
		# SEARCH WITH CHARACTER LENGTH
		elsif ($MODE == 0) {
			print "This does not work yet :)";
		}
	}
	close($fh);

	# File reached EOF, now check the words that are still in the history
	$check_oldest_word->() while @history > 1;