#!/usr/bin/perl -w
#
# Find the word distance for each word, and signal words that are close 
# together. Can be useful for reviewing written text (papers, essays, 
# whatever) to see if you might need a thesaurus somewhere.
# 
# Tim van Werkoven, 20090426 <t.i.m.vanwerkhoven@xs4all.nl>
# This file is licensed under the Creative Commons Attribution-Share Alike
# license versions 3.0 or higher, see
# http://creativecommons.org/licenses/by-sa/3.0/

use strict;
use warnings;

# If the inter-word distance is less than this, signal a warning
my $LIMIT = 10;
# Is the above count in characters (=0) or words (=1)?
# Only word mode is implemented; character mode aborts with a message.
my $MODE = 1;
# What is the minimum length of words to check for?
my $MINLEN = 4;
# What is a comment? Lines starting with this string are skipped entirely.
my $COMMCHAR = "%";

# Sliding window of words that still have to be checked against the words
# following them. Each entry is [ $word, $line_number ], so that a hit can be
# reported on the line the word was actually read from.
my @history;

# Every hit found so far, as [ $line_number, $word, $distance ], together
# with a running count of the hits.
my @output;
my $hits = 0;

# Reduce a word to the form used for comparison: lower-cased, with every
# non-word character and every underscore removed.
sub normalize_word {
    my ($word) = @_;
    my $normalized = lc $word;
    $normalized =~ s/\W|_//g;
    return $normalized;
}

# Search for a repetition of $word among the words that follow it.
#
#   $word      the word as it appeared in the text
#   $upcoming  array ref with the words following $word, nearest first
#
# Only the first $LIMIT entries of @$upcoming are examined. Returns the
# zero-based index of the nearest repetition, or nothing (undef in scalar
# context) when there is none or when the normalized word is shorter than
# $MINLEN characters.
sub find_repeat {
    my ($word, $upcoming) = @_;

    my $needle = normalize_word($word);
    # Short words (articles, prepositions, ...) are never reported.
    return if length($needle) < $MINLEN;

    # Bound the scan by position rather than by the truth of the element,
    # so that a word such as "0" cannot cut the search short.
    my $last = $#{$upcoming};
    $last = $LIMIT - 1 if $last > $LIMIT - 1;

    for my $i (0 .. $last) {
        return $i if normalize_word($upcoming->[$i]) eq $needle;
    }
    return;
}

# Check and discard words from the front of @history until at most $keep
# words are left. Each removed word is compared with the (up to $LIMIT)
# words following it; a repetition is recorded in @output, counted in $hits
# and printed together with the stretch of text between both occurrences.
sub drain_history {
    my ($keep) = @_;

    while (@history > $keep) {
        my ($word, $word_line) = @{ shift @history };

        my $last = $#history;
        $last = $LIMIT - 1 if $last > $LIMIT - 1;
        my @upcoming = map { $_->[0] } @history[0 .. $last];

        my $distance = find_repeat($word, \@upcoming);
        next if !defined $distance;

        push @output, [ $word_line, $word, $distance ];
        $hits++;
        print "FOUND: $word (d: $distance), line $word_line: ",
            join(' ', $word, @upcoming[0 .. $distance]), "\n";
    }
    return;
}

# Read the file named by the first command line argument and report every
# word that is repeated within $LIMIT words of itself.
sub main {
    # SEARCH WITH CHARACTER LENGTH is not available; say so once and stop
    # instead of repeating the message for every input line.
    if ($MODE == 0) {
        die "This does not work yet :)\n";
    }

    my $path = $ARGV[0];
    die "Usage: $0 FILE\n" if !defined $path;

    open(my $fh, '<', $path) or die "Cannot open '$path': $!\n";

    # keep track of the line number we're looking at
    my $line = 0;
    while (my $text = <$fh>) {
        $line++;
        chomp $text;

        # Check for comments
        next if $text =~ /^\Q$COMMCHAR\E/;

        # Split on any run of whitespace so that double spaces and tabs do
        # not produce empty "words", and remember where each word came from.
        push @history, map { [ $_, $line ] } split ' ', $text;

        # A word can only be judged once $LIMIT followers are available.
        drain_history($LIMIT);
    }
    close($fh);

    # File reached EOF, now check the remaining history against whatever
    # followers are left.
    drain_history(1);
    return;
}

# Run only when executed as a script, so the helpers above can be loaded
# (e.g. by a test) without touching @ARGV.
main() unless caller;

# See worddistance.pl on GitHub.