meehow/anagram.c

## anagram.c
/* Anagram: a little filter that emits anagrams of its argument.
 *
 * This program's main point of interest is its speed, which it
 * obtains by using native FPU operations to do the anagram tests.
 *
 * The basic idea is to form a one-to-one mapping of letter values to prime
 * numbers, allowing words to be represented as composite numbers by
 * multiplying together the primes that map each letter in the word.
 * Words formed from the same letters, regardless of order, will then map
 * to the same composite number.  Also, if one word's number divides exactly
 * into another word's number, the first word's letters must all appear in
 * the second word.
 *
 *
 * Copyright 2004 Stephen Thomas
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 *
 * Modification history:
 *
 * First release Feb 14 2004, Stephen Thomas (flabdablet yahoo com)
 * Compiles cleanly for me on a Pentium 4 running Linux with gcc 3.2.2:
 * gcc -O3 -fomit-frame-pointer -march=i686 -Wall -o anagram anagram.c
 * Please let me know if you find it useful or at least interesting,
 * and send me a copy if you release something based on it.
 */

#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <math.h>
#include <errno.h>

#define READSIZE 4096
#define DOS_TEXT_FORMAT 0

int main (int argc, char **argv)
{
    static const float primes[] = {
           2,   3,   5,   7,  11,
          13,  17,  19,  23,  29,
          31,  37,  41,  43,  47,
	  53,  59,  61,  67,  71,
	  73,  79,  83,  98,  97,
	 101, 103, 107, 109, 113,
	 127, 131, 137, 139, 149
    };
    static float factors[512];

    static unsigned char buffer[2][READSIZE];
    ssize_t count;
    int ch;
    unsigned char *buf_p;
    unsigned char *word_p;

    int index, exit_status, length, min_length;
    long double letters_key, word_key, needed_key, inexact;

    /*  If no arguments are given, display usage message    */

    if (argc < 2) {
        fprintf (
            stderr,
            "Usage: %s LETTERS [LENGTH] [NEEDED]\n"
            "Try `%s --help' for more information.\n",
            argv[0], argv[0]
        );
        exit (0);
    }

    /*  Display more detailed help if the single argument
        is "--help"     */

    if (argc == 2 && argv[1] != NULL && strcmp (argv[1], "--help") == 0) {
        fprintf (
            stderr,
            "Usage: %s LETTERS [LENGTH] [NEEDED]\n"
            "Search standard input for all words that can be formed from\n"
            "at least LENGTH of LETTERS, including all NEEDED.\n"
            "Example: %s octopus 5 oo </usr/dict/words\n"
            "\n"
            "If NEEDED is not given, find words that can be formed from\n"
            "at least LENGTH of LETTERS.  If LENGTH is not given, find\n"
            "words that can be formed from all of LETTERS.  Standard input\n"
            "is expected to be a list of words, one per line.  Exit status\n"
            "is 0 if match, 1 if no match, and 2 if trouble.\n"
            "\n"
            "Report bugs to flabdablet, yahoo, com.\n",
            argv[0], argv[0]
        );
        exit (0);
    }

    /*  Find the smallest floating point number that would cause
        rounding errors when used as a large integer.   */

    inexact = 1;
    while (inexact + 1 != inexact) {
        inexact *= 2;
    }

    /*  The first argument specifies the list of letters to select anagrams
        from.  Build a table of factors containing a unique prime for each
        letter in the argument, and multiply together the factors corres-
        ponding to each letter to build a composite number in letters_key
        that represents the entire list.

        Allocate primes from smallest to largest in an effort to keep their
        product small.  This allows reasonably large letter lists to be
        represented within the limits of the long double numeric format.   */

    index = 0;
    letters_key = 1;
    length = 0;
    memset (factors, 0, sizeof factors);
    if (DOS_TEXT_FORMAT) factors['\r'] = 1;

    if (argc > 1 && argv[1] != NULL) {
        while ((ch = argv[1][length++]) != 0) {

            /*  If letter is new, allocate a corresponding prime factor */

            if (factors[ch] == 0 && index < sizeof primes / sizeof *primes) {
                factors[ch] =
                factors[tolower(ch)] =
                factors[toupper(ch)] = primes[index++];
            }

            /*  Form composite number representing entire letter list   */

            letters_key *= factors[ch];
        }

        /*  If the primes table was exhausted or letters_key is too
            big to avoid rounding errors, the search results will be
            meaningless.    */

        if (letters_key == 0 || letters_key >= inexact) {
            fprintf (
                stderr,
                "%s: Can't handle such a big letter set\n",
                argv[0]
            );
            exit (2);
        }
    }

    /*  Allow a minimum length to be specified, so partial anagrams can
        be generated (useful for newspaper anagram puzzles).  The values
        of the length variables include word-termination characters, so
        user-specified values must be adjusted.

        If no minimum length is specified, anagrams will contain all the
        letters in the first argument.  */

    min_length = length;
    if (argc > 2 && argv[2] != NULL) {
        min_length = atoi (argv[2]) + 1;
        if (min_length > length) {
            fprintf (
                stderr,
                "%s: Minimum length longer than letter set\n",
                argv[0]
            );
            exit (2);
        }
    }
    if (DOS_TEXT_FORMAT) ++min_length;

    /*  If a minimum length was specified, also allow specification of
        a list of letters that must all be present in each anagram.  This
        list is represented as a composite number, needed_key, consisting
        of the product of all the factors corresponding to the letters
        listed.     */

    needed_key = 1;
    length = 0;
    if (argc > 3 && argv[3] != NULL) {
        while ((ch = argv[3][length++]) != 0) {
            needed_key *= factors[ch];
        }

        /*  If any letters occur that weren't included in argv[1],
            their corresponding factors will be zero, and so will
            needed_key.  If there are more occurrences of a given
            letter than there were in argv[1], needed_key will not
            divide exactly into letters_key. */

        if (needed_key == 0 || fmodl (letters_key, needed_key) != 0) {
            fprintf (
                stderr,
                "%s: Needed letters not in letter set\n",
                argv[0]
            );
            exit (2);
        }
    }

    /*  Get set up for first input word   */

    word_p = buffer[1];
    word_key = 1;
    exit_status = 1;

    /*  Read next chunk of input, using the low-level read() function
        for speed   */

    while ((count = read (0, buffer[1], READSIZE)) > 0) {

    /*  Scan the input buffer, looking for the newlines that mark
        the ends of words   */

        buf_p = buffer[1];
        do {
            if ((ch = *buf_p++) != '\n') {

                /*  Each non-newline gets its corresponding prime
                    factor multiplied into word_key, the composite
                    number that represents all the letters in the
                    current input word.  This is all the processing
                    needed for most characters in the input buffer. */

                word_key *= factors[ch];
            }
            else {

                /*  End of word found: if word contains only letters
                    from argv[1] and is long enough and has no more
                    repeats of those letters than occur in argv[1]
                    and contains all the letters from argv[3],
                    write it out and remember an anagram was found. */

                if (word_key > 0 &&
                    buf_p - word_p >= min_length &&
                    fmodl (letters_key, word_key) == 0 &&
                    fmodl (word_key, needed_key) == 0
                ) {
                    fwrite (word_p, buf_p - word_p, 1, stdout);
                    exit_status = 0;
                }

                /*  Get set up for next input word    */

                word_p = buf_p;
                word_key = 1;
            }
        }
        while (--count);

        /*  After processing the whole buffer, transfer any leftover
            word parts to a spot just before the start of the buffer
            in case they need outputting as part of the next anagram
            found.  */

        memcpy (buffer[1] - (buf_p - word_p), word_p, buf_p - word_p);
        word_p = buffer[1] - (buf_p - word_p);
    }

    /*  Report read failures    */

    if (count < 0) {
        perror (argv[0]);
        exit (2);
    }

    return exit_status;
}
	/* Anagram: a little filter that emits anagrams of its argument.
	*
	* This program's main point of interest is its speed, which it
	* obtains by using native FPU operations to do the anagram tests.
	*
	* The basic idea is to form a one-to-one mapping of letter values to prime
	* numbers, allowing words to be represented as composite numbers by
	* multiplying together the primes that map each letter in the word.
	* Words formed from the same letters, regardless of order, will then map
	* to the same composite number. Also, if one word's number divides exactly
	* into another word's number, the first word's letters must all appear in
	* the second word.
	*
	*
	* Copyright 2004 Stephen Thomas
	*
	* This program is free software; you can redistribute it and/or modify
	* it under the terms of the GNU General Public License as published by
	* the Free Software Foundation; either version 2 of the License, or
	* (at your option) any later version.
	*
	* This program is distributed in the hope that it will be useful,
	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	* GNU General Public License for more details.
	*
	* You should have received a copy of the GNU General Public License
	* along with this program; if not, write to the Free Software
	* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
	*
	*
	* Modification history:
	*
	* First release Feb 14 2004, Stephen Thomas (flabdablet yahoo com)
	* Compiles cleanly for me on a Pentium 4 running Linux with gcc 3.2.2:
	* gcc -O3 -fomit-frame-pointer -march=i686 -Wall -o anagram anagram.c
	* Please let me know if you find it useful or at least interesting,
	* and send me a copy if you release something based on it.
	*/

	#include <unistd.h>
	#include <stdlib.h>
	#include <stdio.h>
	#include <string.h>
	#include <ctype.h>
	#include <math.h>
	#include <errno.h>

	#define READSIZE 4096
	#define DOS_TEXT_FORMAT 0

	int main (int argc, char **argv)
	{
	static const float primes[] = {
	2, 3, 5, 7, 11,
	13, 17, 19, 23, 29,
	31, 37, 41, 43, 47,
	53, 59, 61, 67, 71,
	73, 79, 83, 98, 97,
	101, 103, 107, 109, 113,
	127, 131, 137, 139, 149
	};
	static float factors[512];

	static unsigned char buffer[2][READSIZE];
	ssize_t count;
	int ch;
	unsigned char *buf_p;
	unsigned char *word_p;

	int index, exit_status, length, min_length;
	long double letters_key, word_key, needed_key, inexact;

	/* If no arguments are given, display usage message */

	if (argc < 2) {
	fprintf (
	stderr,
	"Usage: %s LETTERS [LENGTH] [NEEDED]\n"
	"Try `%s --help' for more information.\n",
	argv[0], argv[0]
	);
	exit (0);
	}

	/* Display more detailed help if the single argument
	is "--help" */

	if (argc == 2 && argv[1] != NULL && strcmp (argv[1], "--help") == 0) {
	fprintf (
	stderr,
	"Usage: %s LETTERS [LENGTH] [NEEDED]\n"
	"Search standard input for all words that can be formed from\n"
	"at least LENGTH of LETTERS, including all NEEDED.\n"
	"Example: %s octopus 5 oo </usr/dict/words\n"
	"\n"
	"If NEEDED is not given, find words that can be formed from\n"
	"at least LENGTH of LETTERS. If LENGTH is not given, find\n"
	"words that can be formed from all of LETTERS. Standard input\n"
	"is expected to be a list of words, one per line. Exit status\n"
	"is 0 if match, 1 if no match, and 2 if trouble.\n"
	"\n"
	"Report bugs to flabdablet, yahoo, com.\n",
	argv[0], argv[0]
	);
	exit (0);
	}

	/* Find the smallest floating point number that would cause
	rounding errors when used as a large integer. */

	inexact = 1;
	while (inexact + 1 != inexact) {
	inexact *= 2;
	}

	/* The first argument specifies the list of letters to select anagrams
	from. Build a table of factors containing a unique prime for each
	letter in the argument, and multiply together the factors corres-
	ponding to each letter to build a composite number in letters_key
	that represents the entire list.

	Allocate primes from smallest to largest in an effort to keep their
	product small. This allows reasonably large letter lists to be
	represented within the limits of the long double numeric format. */

	index = 0;
	letters_key = 1;
	length = 0;
	memset (factors, 0, sizeof factors);
	if (DOS_TEXT_FORMAT) factors['\r'] = 1;

	if (argc > 1 && argv[1] != NULL) {
	while ((ch = argv[1][length++]) != 0) {

	/* If letter is new, allocate a corresponding prime factor */

	if (factors[ch] == 0 && index < sizeof primes / sizeof *primes) {
	factors[ch] =
	factors[tolower(ch)] =
	factors[toupper(ch)] = primes[index++];
	}

	/* Form composite number representing entire letter list */

	letters_key *= factors[ch];
	}

	/* If the primes table was exhausted or letters_key is too
	big to avoid rounding errors, the search results will be
	meaningless. */

	if (letters_key == 0 \|\| letters_key >= inexact) {
	fprintf (
	stderr,
	"%s: Can't handle such a big letter set\n",
	argv[0]
	);
	exit (2);
	}
	}

	/* Allow a minimum length to be specified, so partial anagrams can
	be generated (useful for newspaper anagram puzzles). The values
	of the length variables include word-termination characters, so
	user-specified values must be adjusted.

	If no minimum length is specified, anagrams will contain all the
	letters in the first argument. */

	min_length = length;
	if (argc > 2 && argv[2] != NULL) {
	min_length = atoi (argv[2]) + 1;
	if (min_length > length) {
	fprintf (
	stderr,
	"%s: Minimum length longer than letter set\n",
	argv[0]
	);
	exit (2);
	}
	}
	if (DOS_TEXT_FORMAT) ++min_length;

	/* If a minimum length was specified, also allow specification of
	a list of letters that must all be present in each anagram. This
	list is represented as a composite number, needed_key, consisting
	of the product of all the factors corresponding to the letters
	listed. */

	needed_key = 1;
	length = 0;
	if (argc > 3 && argv[3] != NULL) {
	while ((ch = argv[3][length++]) != 0) {
	needed_key *= factors[ch];
	}

	/* If any letters occur that weren't included in argv[1],
	their corresponding factors will be zero, and so will
	needed_key. If there are more occurrences of a given
	letter than there were in argv[1], needed_key will not
	divide exactly into letters_key. */

	if (needed_key == 0 \|\| fmodl (letters_key, needed_key) != 0) {
	fprintf (
	stderr,
	"%s: Needed letters not in letter set\n",
	argv[0]
	);
	exit (2);
	}
	}

	/* Get set up for first input word */

	word_p = buffer[1];
	word_key = 1;
	exit_status = 1;

	/* Read next chunk of input, using the low-level read() function
	for speed */

	while ((count = read (0, buffer[1], READSIZE)) > 0) {

	/* Scan the input buffer, looking for the newlines that mark
	the ends of words */

	buf_p = buffer[1];
	do {
	if ((ch = *buf_p++) != '\n') {

	/* Each non-newline gets its corresponding prime
	factor multiplied into word_key, the composite
	number that represents all the letters in the
	current input word. This is all the processing
	needed for most characters in the input buffer. */

	word_key *= factors[ch];
	}
	else {

	/* End of word found: if word contains only letters
	from argv[1] and is long enough and has no more
	repeats of those letters than occur in argv[1]
	and contains all the letters from argv[3],
	write it out and remember an anagram was found. */

	if (word_key > 0 &&
	buf_p - word_p >= min_length &&
	fmodl (letters_key, word_key) == 0 &&
	fmodl (word_key, needed_key) == 0
	) {
	fwrite (word_p, buf_p - word_p, 1, stdout);
	exit_status = 0;
	}

	/* Get set up for next input word */

	word_p = buf_p;
	word_key = 1;
	}
	}
	while (--count);

	/* After processing the whole buffer, transfer any leftover
	word parts to a spot just before the start of the buffer
	in case they need outputting as part of the next anagram
	found. */

	memcpy (buffer[1] - (buf_p - word_p), word_p, buf_p - word_p);
	word_p = buffer[1] - (buf_p - word_p);
	}

	/* Report read failures */

	if (count < 0) {
	perror (argv[0]);
	exit (2);
	}

	return exit_status;
	}