pts/ladder.pl

## ladder.pl
#!perl -w
# let's learn pod!
=head1 NAME

ladder - Finds a word letter between two words

=head1 SYNOPSIS

B<ladder> I<word1> I<word2> [I<dictfile>]

=head1 DESCRIPTION

This is a solution for Expert Perl Quiz of the Week 22
(L<http://perl.plover.com/qotw/e/022>).

It builds a word ladder between two words given as arguments.

The program needs a I<sorted> plain text dictionary file to build the word ladder.
The name of this dictionary can be given as an optional third argument.
There is a default dictionary, of which the path can be set in the source.

=head1 SWITCHES

=over

=item B<-l>

Do not use capitalized words from the dictionary.
If this switch is not used, case differences are ignored.

=item B<-u>

Do not do the first-letter optimization.
The optimization should make the program run faster in most cases,
but when it makes it slower, you can disable it with this switch.
The optimization relies on that the dictionary is sorted (case-insensitively),
so when you use an unsorted dictionary, you should use this switch.

=item B<-w>

Spit out various debugging information, mostly useful for developping.

=back

=head1 IMPLEMENTATION

This program uses a very simple algorithm: a two-way breadth-first search.
Thus, we first read the dictionary.
Then we take the graph of words with the edges being those pair of words that
differ in one letter only.
Then we do a breadth-first search from both endpoints simultanously.
If we find a word that is reachable from both endpoints, we return success.
If the search from either endpoint ends, we return a failure.

I first wanted to implement a more complicated (and hopefully faster)
search algorithm, but when I implemented this one,
it turned out that the search itself is much faster than reading the dictionary,
so I rather optimized I/O.

Thus, the program now tries to read only those parts of the dictionary that start
with the same letter as one of the given words, and try to find a word ladder
among those.
Furthermore, there is yet one more level of optimization concerning the first
two letters of the words.

=head1 EXAMPLE

You run:

	perl ladder find lose

You get:

	find
	fine
	fise
	Lise
	lose

=head1 AUTHOR

L<ambrus@...>

=cut

# settings

our $default_dictname = "/home/ambrus/a/qotw/e22/words-b";

# 0. Declarations

use warnings; use strict;
use IO::Handle;

# 1. Parse argv

our($dictname, $noproper, $word1o, $word2o, $debug, $unoptimize, $unoptimize2);
{
	use Getopt::Std;
	getopts "luUw", \my %switch;
	($noproper, $debug, $unoptimize, $unoptimize2) = @switch{qw(l w u U)};
	2<=@ARGV && @ARGV<=3 or die "Usage: ladder -d dictfile word1 word2";
	($word1o, $word2o, $dictname) = @ARGV;
	defined($dictname) or $dictname = $default_dictname;
}

# 2. Locale-dependent functions

{
use locale;
my $re = qr/\A[[:alpha:]]+\z/;
$noproper and $re = qr/\A[[:lower:]]+\z/;
sub checkword {
	my($w) = @_;
	$w=~$re ? 1 : 0;
}
# You might want to treat some letters the same (especially in French).
# In that case, change the following function accordingly.  The function must
# not change the lenght of the word or else the program will break.
sub normalize {
	lc $_[0];
}
}

# 3. Do some simple checking on the words

our($length, $word1, $word2);

sub addword;
do {
	checkword($word1o) && checkword($word2o) or
		die "one of the given words contain illegal characters";
	length($word1o)==length($word2o) or
		die "it is impossible to build a word ladder, as the two words are of different length\n";
	$length = length($word1o);
	$word1 = normalize($word1o);
	addword $word1, $word1o;
	$word2 = normalize($word2o);
	addword $word2, $word2o;
	$word1 eq $word2 and do {
		print $word1, "\n";
		exit 0;
		};
};

# 4. Read and process the dictionary
# this should be really simple, but the code somehow became a mess.

our %word; # this hash contains the original forms of a word
our $dict; # a filehandle

sub addword;
do {
	open $dict, "<", $dictname or die qq{cannot open dictionary file "$dictname": $!};
	$debug and warn qq{dictionary file "$dictname" open};
};

sub addword { # $w should be eq normalize($o)
	my($w, $o) = @_;
	defined($word{$w}) or
		$word{$w} = $o;
	}

sub readdict {
	no locale;
	my($until) = @_;
	my($w, $l);
	$! = 0;
	while (defined($l = <$dict>)) {
		chomp $l;
		length($l)==$length or next;
		checkword $l or next;
		$w = normalize($l);
		$w lt $until or last;
		addword $w, $l;
		}
	error $dict and die qq{error readig dictionary file "$dictname": $!}; # see "http://www.perlmonks.com/?node_id=386996"
	$debug and warn "there are ", (0+keys(%word)), " words in the hash now";
}

# 5. Optimization: not to read the whole word list unless neccessary

our $phase;
sub seekpart; sub makepatts; sub dosearch; sub seekpart;

sub readpart {
	my($prefix) = @_;
	defined($prefix) or $prefix = "";
	seekpart $prefix;
	readdict $prefix . "\xff\xff\xff"; # yes, this is a kludge, but I hope no word will have a triple \xff in it.
}

sub seekpart {
	my($prefix) = @_;
	my($l, $w);
	my($a, $b) = (0, -s $dict);
	defined($b) or die qq{error fstating the dictionary file "$dictname": $!};
	until ($b-$a<=2048) {
		my $c = int(($a + $b)/2);
		seek $dict, $c, 0 or
			die qq{error seeking the dictionary file "$dictname" to $c during bsearch: $!};
		$! = 0;
		$l = <$dict>; # get aligned to a newline
		chomp($l = <$dict>) until
			checkword $l or !defined($l);
		error $dict and
			die qq{error reading the dictionary file "$dictname" during bsearch: $!};
		$w = defined($l) ? normalize($l) : ("\xff" x 6);
		($w gt $prefix) ? $b : $a = $c;
	}
	$debug and warn qq{bsearch done ("$prefix" $a)};
	seek $dict, $a, 0 or
		die qq{error seeking the dictionary file "dictname" to $a, during bsearch, final: $!};
	$a>0 and do { # IMPORTANT!
		<$dict>;
		error $dict and
			die qq{error reading the dictionary file "$dictname" during bsearch, final align: $!}
		};
}

our %readpart;
sub condreadpart {
	my($l) = @_;
	unless ($readpart{$l}) {
		$debug and warn qq{processing prefix "$l" from the dictionary};
		readpart $l;
		$readpart{$l} = 1;
	}
}

ALL: {
	if (!$unoptimize) {
		if (!$unoptimize2) {
			$phase = 2;
			condreadpart substr($word1, 0, 2);
			condreadpart substr($word2, 0, 2);
			condreadpart substr($word1, 0, 1) . substr($word2, 1, 1);
			condreadpart substr($word2, 0, 1) . substr($word1, 1, 1);
			makepatts;
			dosearch;
		}
		$phase = 1;
		condreadpart substr($word1, 0, 1);
		condreadpart substr($word2, 0, 1);
		makepatts;
		dosearch;
	}
	$phase = 0;
	$debug and warn "processing the whole dictionary";
	readpart "";
	makepatts;
	close $dict or die qq{error closing dictionary file "$dictname": $!};
	dosearch;
}

# 6. Create helper table so that we can find adjacent words faster

our %patt;

sub makepatts {
	%patt = ();
	for my $w (keys %word) {
		for my $n (0..length($w)-1) {
			my $p = substr($w, 0, $n) . "." . substr($w, $n+1);
			push @{$patt{$p}}, $w;
		}
	}
	$debug and warn "there are ", (0+keys(%word)), " patterns in the hash now";
}

# 7. Do the actual search

our @qu; # the queue needed for the bsearch
our %seen; # what nodes were seen in the bsearch and from where
sub visit; sub visit1; sub success; sub neighbours; sub failure;

sub cleansearch {
	%seen = ();
}

sub dosearch {
	PHASE: {
	cleansearch;
	@qu = (
		[$word1, 1, undef],
		1,
		[$word2, 2, undef],
		2,
		);
	while (my $e = shift @qu) {
		if (ref $e) {
			visit @$e;
		} else {
			push @qu, $e;
			!ref($qu[0]) and failure ($e);
			$debug and warn "$e\n";
		}
		}
	die "internal error: the markers have somehow disappeared";
	}
}

sub visit {
	my($w, $dir, $from) = @_;
	my $s = $seen{$w}[0];
	$s && $s!=$dir and
		success $w, $dir, $from;
	$s and return;
	$seen{$w} = [$dir, $from];
	neighbours $w, sub
		{ push @qu, [$_[0], $dir, $w]; };
}

# 8. Find the neighbours of a given word

sub neighbours {
	my($w, $cb) = @_;
	for my $n (0..length($w)-1) {
		my $p = substr($w, 0, $n) . "." . substr($w, $n+1);
		@{$patt{$p}}<=1 and next;
		for my $x (@{$patt{$p}}) {
			&$cb($x) unless $x eq $w;
		}
	}
}


# 9. Output the result, either word ladder or failure

sub failure {
	$debug and warn "phase $phase failed, word$_[0] was more isolated,";
	$phase or do {
		die "no word ladder found\n";
	};
	no warnings "exiting";
	next PHASE; # :-)
}

sub printladder1; sub printladder2;
sub success {
	$debug and warn "about to print solution";
	my($pn, $dir, $po) = @_;
	my($p1, $p2) = $dir==1 ? ($po, $pn) : ($pn, $po);
	printladder1 $p1;
	printladder2 $p2;
	no warnings "exiting";
	last ALL;
}

sub printladder1 {
	my $r = $word{$_[0]};
	my $n = $seen{$_[0]}[1];
	$n and printladder1 $n;
	print $r, "\n";
}
sub printladder2 {
	my $r = $word{$_[0]};
	my $n = $seen{$_[0]}[1];
	print $r, "\n";
	$n and printladder2 $n;
}

__END__
	#!perl -w
	# let's learn pod!
	=head1 NAME

	ladder - Finds a word letter between two words

	=head1 SYNOPSIS

	B<ladder> I<word1> I<word2> [I<dictfile>]

	=head1 DESCRIPTION

	This is a solution for Expert Perl Quiz of the Week 22
	(L<http://perl.plover.com/qotw/e/022>).

	It builds a word ladder between two words given as arguments.

	The program needs a I<sorted> plain text dictionary file to build the word ladder.
	The name of this dictionary can be given as an optional third argument.
	There is a default dictionary, of which the path can be set in the source.

	=head1 SWITCHES

	=over

	=item B<-l>

	Do not use capitalized words from the dictionary.
	If this switch is not used, case differences are ignored.

	=item B<-u>

	Do not do the first-letter optimization.
	The optimization should make the program run faster in most cases,
	but when it makes it slower, you can disable it with this switch.
	The optimization relies on that the dictionary is sorted (case-insensitively),
	so when you use an unsorted dictionary, you should use this switch.

	=item B<-w>

	Spit out various debugging information, mostly useful for developping.

	=back

	=head1 IMPLEMENTATION

	This program uses a very simple algorithm: a two-way breadth-first search.
	Thus, we first read the dictionary.
	Then we take the graph of words with the edges being those pair of words that
	differ in one letter only.
	Then we do a breadth-first search from both endpoints simultanously.
	If we find a word that is reachable from both endpoints, we return success.
	If the search from either endpoint ends, we return a failure.

	I first wanted to implement a more complicated (and hopefully faster)
	search algorithm, but when I implemented this one,
	it turned out that the search itself is much faster than reading the dictionary,
	so I rather optimized I/O.

	Thus, the program now tries to read only those parts of the dictionary that start
	with the same letter as one of the given words, and try to find a word ladder
	among those.
	Furthermore, there is yet one more level of optimization concerning the first
	two letters of the words.

	=head1 EXAMPLE

	You run:

	perl ladder find lose

	You get:

	find
	fine
	fise
	Lise
	lose

	=head1 AUTHOR

	L<ambrus@...>

	=cut

	# settings

	our $default_dictname = "/home/ambrus/a/qotw/e22/words-b";

	# 0. Declarations

	use warnings; use strict;
	use IO::Handle;

	# 1. Parse argv

	our($dictname, $noproper, $word1o, $word2o, $debug, $unoptimize, $unoptimize2);
	{
	use Getopt::Std;
	getopts "luUw", \my %switch;
	($noproper, $debug, $unoptimize, $unoptimize2) = @switch{qw(l w u U)};
	2<=@ARGV && @ARGV<=3 or die "Usage: ladder -d dictfile word1 word2";
	($word1o, $word2o, $dictname) = @ARGV;
	defined($dictname) or $dictname = $default_dictname;
	}

	# 2. Locale-dependent functions

	{
	use locale;
	my $re = qr/\A[[:alpha:]]+\z/;
	$noproper and $re = qr/\A[[:lower:]]+\z/;
	sub checkword {
	my($w) = @_;
	$w=~$re ? 1 : 0;
	}
	# You might want to treat some letters the same (especially in French).
	# In that case, change the following function accordingly. The function must
	# not change the lenght of the word or else the program will break.
	sub normalize {
	lc $_[0];
	}
	}

	# 3. Do some simple checking on the words

	our($length, $word1, $word2);

	sub addword;
	do {
	checkword($word1o) && checkword($word2o) or
	die "one of the given words contain illegal characters";
	length($word1o)==length($word2o) or
	die "it is impossible to build a word ladder, as the two words are of different length\n";
	$length = length($word1o);
	$word1 = normalize($word1o);
	addword $word1, $word1o;
	$word2 = normalize($word2o);
	addword $word2, $word2o;
	$word1 eq $word2 and do {
	print $word1, "\n";
	exit 0;
	};
	};

	# 4. Read and process the dictionary
	# this should be really simple, but the code somehow became a mess.

	our %word; # this hash contains the original forms of a word
	our $dict; # a filehandle

	sub addword;
	do {
	open $dict, "<", $dictname or die qq{cannot open dictionary file "$dictname": $!};
	$debug and warn qq{dictionary file "$dictname" open};
	};

	sub addword { # $w should be eq normalize($o)
	my($w, $o) = @_;
	defined($word{$w}) or
	$word{$w} = $o;
	}

	sub readdict {
	no locale;
	my($until) = @_;
	my($w, $l);
	$! = 0;
	while (defined($l = <$dict>)) {
	chomp $l;
	length($l)==$length or next;
	checkword $l or next;
	$w = normalize($l);
	$w lt $until or last;
	addword $w, $l;
	}
	error $dict and die qq{error readig dictionary file "$dictname": $!}; # see "http://www.perlmonks.com/?node_id=386996"
	$debug and warn "there are ", (0+keys(%word)), " words in the hash now";
	}

	# 5. Optimization: not to read the whole word list unless neccessary

	our $phase;
	sub seekpart; sub makepatts; sub dosearch; sub seekpart;

	sub readpart {
	my($prefix) = @_;
	defined($prefix) or $prefix = "";
	seekpart $prefix;
	readdict $prefix . "\xff\xff\xff"; # yes, this is a kludge, but I hope no word will have a triple \xff in it.
	}

	sub seekpart {
	my($prefix) = @_;
	my($l, $w);
	my($a, $b) = (0, -s $dict);
	defined($b) or die qq{error fstating the dictionary file "$dictname": $!};
	until ($b-$a<=2048) {
	my $c = int(($a + $b)/2);
	seek $dict, $c, 0 or
	die qq{error seeking the dictionary file "$dictname" to $c during bsearch: $!};
	$! = 0;
	$l = <$dict>; # get aligned to a newline
	chomp($l = <$dict>) until
	checkword $l or !defined($l);
	error $dict and
	die qq{error reading the dictionary file "$dictname" during bsearch: $!};
	$w = defined($l) ? normalize($l) : ("\xff" x 6);
	($w gt $prefix) ? $b : $a = $c;
	}
	$debug and warn qq{bsearch done ("$prefix" $a)};
	seek $dict, $a, 0 or
	die qq{error seeking the dictionary file "dictname" to $a, during bsearch, final: $!};
	$a>0 and do { # IMPORTANT!
	<$dict>;
	error $dict and
	die qq{error reading the dictionary file "$dictname" during bsearch, final align: $!}
	};
	}

	our %readpart;
	sub condreadpart {
	my($l) = @_;
	unless ($readpart{$l}) {
	$debug and warn qq{processing prefix "$l" from the dictionary};
	readpart $l;
	$readpart{$l} = 1;
	}
	}

	ALL: {
	if (!$unoptimize) {
	if (!$unoptimize2) {
	$phase = 2;
	condreadpart substr($word1, 0, 2);
	condreadpart substr($word2, 0, 2);
	condreadpart substr($word1, 0, 1) . substr($word2, 1, 1);
	condreadpart substr($word2, 0, 1) . substr($word1, 1, 1);
	makepatts;
	dosearch;
	}
	$phase = 1;
	condreadpart substr($word1, 0, 1);
	condreadpart substr($word2, 0, 1);
	makepatts;
	dosearch;
	}
	$phase = 0;
	$debug and warn "processing the whole dictionary";
	readpart "";
	makepatts;
	close $dict or die qq{error closing dictionary file "$dictname": $!};
	dosearch;
	}

	# 6. Create helper table so that we can find adjacent words faster

	our %patt;

	sub makepatts {
	%patt = ();
	for my $w (keys %word) {
	for my $n (0..length($w)-1) {
	my $p = substr($w, 0, $n) . "." . substr($w, $n+1);
	push @{$patt{$p}}, $w;
	}
	}
	$debug and warn "there are ", (0+keys(%word)), " patterns in the hash now";
	}

	# 7. Do the actual search

	our @qu; # the queue needed for the bsearch
	our %seen; # what nodes were seen in the bsearch and from where
	sub visit; sub visit1; sub success; sub neighbours; sub failure;

	sub cleansearch {
	%seen = ();
	}

	sub dosearch {
	PHASE: {
	cleansearch;
	@qu = (
	[$word1, 1, undef],
	1,
	[$word2, 2, undef],
	2,
	);
	while (my $e = shift @qu) {
	if (ref $e) {
	visit @$e;
	} else {
	push @qu, $e;
	!ref($qu[0]) and failure ($e);
	$debug and warn "$e\n";
	}
	}
	die "internal error: the markers have somehow disappeared";
	}
	}

	sub visit {
	my($w, $dir, $from) = @_;
	my $s = $seen{$w}[0];
	$s && $s!=$dir and
	success $w, $dir, $from;
	$s and return;
	$seen{$w} = [$dir, $from];
	neighbours $w, sub
	{ push @qu, [$_[0], $dir, $w]; };
	}

	# 8. Find the neighbours of a given word

	sub neighbours {
	my($w, $cb) = @_;
	for my $n (0..length($w)-1) {
	my $p = substr($w, 0, $n) . "." . substr($w, $n+1);
	@{$patt{$p}}<=1 and next;
	for my $x (@{$patt{$p}}) {
	&$cb($x) unless $x eq $w;
	}
	}
	}


	# 9. Output the result, either word ladder or failure

	sub failure {
	$debug and warn "phase $phase failed, word$_[0] was more isolated,";
	$phase or do {
	die "no word ladder found\n";
	};
	no warnings "exiting";
	next PHASE; # :-)
	}

	sub printladder1; sub printladder2;
	sub success {
	$debug and warn "about to print solution";
	my($pn, $dir, $po) = @_;
	my($p1, $p2) = $dir==1 ? ($po, $pn) : ($pn, $po);
	printladder1 $p1;
	printladder2 $p2;
	no warnings "exiting";
	last ALL;
	}

	sub printladder1 {
	my $r = $word{$_[0]};
	my $n = $seen{$_[0]}[1];
	$n and printladder1 $n;
	print $r, "\n";
	}
	sub printladder2 {
	my $r = $word{$_[0]};
	my $n = $seen{$_[0]}[1];
	print $r, "\n";
	$n and printladder2 $n;
	}

	__END__