Skip to content

Instantly share code, notes, and snippets.

@meehow
Created December 28, 2013 22:53
Show Gist options
  • Save meehow/8165246 to your computer and use it in GitHub Desktop.
Save meehow/8165246 to your computer and use it in GitHub Desktop.
Anagram
/* Anagram: a little filter that emits anagrams of its argument.
*
* This program's main point of interest is its speed, which it
* obtains by using native FPU operations to do the anagram tests.
*
* The basic idea is to form a one-to-one mapping of letter values to prime
* numbers, allowing words to be represented as composite numbers by
* multiplying together the primes that map each letter in the word.
* Words formed from the same letters, regardless of order, will then map
* to the same composite number. Also, if one word's number divides exactly
* into another word's number, the first word's letters must all appear in
* the second word.
*
*
* Copyright 2004 Stephen Thomas
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
*
* Modification history:
*
* First release Feb 14 2004, Stephen Thomas (flabdablet yahoo com)
* Compiles cleanly for me on a Pentium 4 running Linux with gcc 3.2.2:
* gcc -O3 -fomit-frame-pointer -march=i686 -Wall -o anagram anagram.c
* Please let me know if you find it useful or at least interesting,
* and send me a copy if you release something based on it.
*/
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <math.h>
#include <errno.h>
#define READSIZE 4096
#define DOS_TEXT_FORMAT 0
int main (int argc, char **argv)
{
static const float primes[] = {
2, 3, 5, 7, 11,
13, 17, 19, 23, 29,
31, 37, 41, 43, 47,
53, 59, 61, 67, 71,
73, 79, 83, 98, 97,
101, 103, 107, 109, 113,
127, 131, 137, 139, 149
};
static float factors[512];
static unsigned char buffer[2][READSIZE];
ssize_t count;
int ch;
unsigned char *buf_p;
unsigned char *word_p;
int index, exit_status, length, min_length;
long double letters_key, word_key, needed_key, inexact;
/* If no arguments are given, display usage message */
if (argc < 2) {
fprintf (
stderr,
"Usage: %s LETTERS [LENGTH] [NEEDED]\n"
"Try `%s --help' for more information.\n",
argv[0], argv[0]
);
exit (0);
}
/* Display more detailed help if the single argument
is "--help" */
if (argc == 2 && argv[1] != NULL && strcmp (argv[1], "--help") == 0) {
fprintf (
stderr,
"Usage: %s LETTERS [LENGTH] [NEEDED]\n"
"Search standard input for all words that can be formed from\n"
"at least LENGTH of LETTERS, including all NEEDED.\n"
"Example: %s octopus 5 oo </usr/dict/words\n"
"\n"
"If NEEDED is not given, find words that can be formed from\n"
"at least LENGTH of LETTERS. If LENGTH is not given, find\n"
"words that can be formed from all of LETTERS. Standard input\n"
"is expected to be a list of words, one per line. Exit status\n"
"is 0 if match, 1 if no match, and 2 if trouble.\n"
"\n"
"Report bugs to flabdablet, yahoo, com.\n",
argv[0], argv[0]
);
exit (0);
}
/* Find the smallest floating point number that would cause
rounding errors when used as a large integer. */
inexact = 1;
while (inexact + 1 != inexact) {
inexact *= 2;
}
/* The first argument specifies the list of letters to select anagrams
from. Build a table of factors containing a unique prime for each
letter in the argument, and multiply together the factors corres-
ponding to each letter to build a composite number in letters_key
that represents the entire list.
Allocate primes from smallest to largest in an effort to keep their
product small. This allows reasonably large letter lists to be
represented within the limits of the long double numeric format. */
index = 0;
letters_key = 1;
length = 0;
memset (factors, 0, sizeof factors);
if (DOS_TEXT_FORMAT) factors['\r'] = 1;
if (argc > 1 && argv[1] != NULL) {
while ((ch = argv[1][length++]) != 0) {
/* If letter is new, allocate a corresponding prime factor */
if (factors[ch] == 0 && index < sizeof primes / sizeof *primes) {
factors[ch] =
factors[tolower(ch)] =
factors[toupper(ch)] = primes[index++];
}
/* Form composite number representing entire letter list */
letters_key *= factors[ch];
}
/* If the primes table was exhausted or letters_key is too
big to avoid rounding errors, the search results will be
meaningless. */
if (letters_key == 0 || letters_key >= inexact) {
fprintf (
stderr,
"%s: Can't handle such a big letter set\n",
argv[0]
);
exit (2);
}
}
/* Allow a minimum length to be specified, so partial anagrams can
be generated (useful for newspaper anagram puzzles). The values
of the length variables include word-termination characters, so
user-specified values must be adjusted.
If no minimum length is specified, anagrams will contain all the
letters in the first argument. */
min_length = length;
if (argc > 2 && argv[2] != NULL) {
min_length = atoi (argv[2]) + 1;
if (min_length > length) {
fprintf (
stderr,
"%s: Minimum length longer than letter set\n",
argv[0]
);
exit (2);
}
}
if (DOS_TEXT_FORMAT) ++min_length;
/* If a minimum length was specified, also allow specification of
a list of letters that must all be present in each anagram. This
list is represented as a composite number, needed_key, consisting
of the product of all the factors corresponding to the letters
listed. */
needed_key = 1;
length = 0;
if (argc > 3 && argv[3] != NULL) {
while ((ch = argv[3][length++]) != 0) {
needed_key *= factors[ch];
}
/* If any letters occur that weren't included in argv[1],
their corresponding factors will be zero, and so will
needed_key. If there are more occurrences of a given
letter than there were in argv[1], needed_key will not
divide exactly into letters_key. */
if (needed_key == 0 || fmodl (letters_key, needed_key) != 0) {
fprintf (
stderr,
"%s: Needed letters not in letter set\n",
argv[0]
);
exit (2);
}
}
/* Get set up for first input word */
word_p = buffer[1];
word_key = 1;
exit_status = 1;
/* Read next chunk of input, using the low-level read() function
for speed */
while ((count = read (0, buffer[1], READSIZE)) > 0) {
/* Scan the input buffer, looking for the newlines that mark
the ends of words */
buf_p = buffer[1];
do {
if ((ch = *buf_p++) != '\n') {
/* Each non-newline gets its corresponding prime
factor multiplied into word_key, the composite
number that represents all the letters in the
current input word. This is all the processing
needed for most characters in the input buffer. */
word_key *= factors[ch];
}
else {
/* End of word found: if word contains only letters
from argv[1] and is long enough and has no more
repeats of those letters than occur in argv[1]
and contains all the letters from argv[3],
write it out and remember an anagram was found. */
if (word_key > 0 &&
buf_p - word_p >= min_length &&
fmodl (letters_key, word_key) == 0 &&
fmodl (word_key, needed_key) == 0
) {
fwrite (word_p, buf_p - word_p, 1, stdout);
exit_status = 0;
}
/* Get set up for next input word */
word_p = buf_p;
word_key = 1;
}
}
while (--count);
/* After processing the whole buffer, transfer any leftover
word parts to a spot just before the start of the buffer
in case they need outputting as part of the next anagram
found. */
memcpy (buffer[1] - (buf_p - word_p), word_p, buf_p - word_p);
word_p = buffer[1] - (buf_p - word_p);
}
/* Report read failures */
if (count < 0) {
perror (argv[0]);
exit (2);
}
return exit_status;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment