arsane/spell.c

## spell.c
/*
 * spell.c --- spell corrector
 *
 * Copyright  (C)  2007  Marcelo Toledo <marcelo@marcelotoledo.org>
 *
 * Version: 1.0
 * Keywords: spell corrector
 * Author: Marcelo Toledo <marcelo@marcelotoledo.org>
 * Maintainer: Marcelo Toledo <marcelo@marcelotoledo.org>
 * URL: http://www.marcelotoledo.org
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 *
 * Commentary:
 *
 * See http://www.marcelotoledo.org.
 *
 * Code:
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <search.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <stdbool.h>

#define DICTIONARY "./big.txt"
#define DICT_SZ    3000000

const char delim[]    = ".,:;`/\"+-_(){}[]<>*&^%$#@!?~/|\\=1234567890 \t\n";
const char alphabet[] = "abcdefghijklmnopqrstuvwxyz";

static char *strtolower(char *word)
{
    char *s;

    for (s = word; *s; s++)
        *s = tolower(*s);

    return word;
}

static ENTRY *find(char *word)
{
    ENTRY e;

    e.key = word;
    return hsearch(e, FIND);
}

static int update(char *word)
{
    ENTRY *e = find(word);

    if (!e)
        return 0;

    e->data++;

    return 1;
}

static int read_file(ENTRY dict)
{
    char *file, *word, *w;
    FILE *fp = fopen(DICTIONARY, "r");
    struct stat sb;

    if (!fp)
        return 0;

    if (stat(DICTIONARY, &sb))
        return 0;

    file = malloc(sb.st_size);
    if (!file) {
        fclose(fp);
        return 0;
    }

    fread(file, sizeof(char), sb.st_size, fp);

    word = strtok(file, delim);
    while(word != NULL) {
        w = strtolower(strdup(word));

        if (!update(w)) {
            dict.key  = w;
            dict.data = 0;
            hsearch(dict, ENTER);
        }

        word = strtok(NULL, delim);
    }

    free(file);
    fclose(fp);

    return 1;
}

static char *concat(char *str1, char *str2, bool bfree)
{
    char * ret;

    if (!str1) {
        str1 = malloc(sizeof(char));
        *str1 = '\0';
    }

    str1 = realloc(str1, strlen(str1) + strlen(str2) + 1);
    ret = strcat(str1, str2);
    if (str2 && bfree) {
        free(str2);
    }

    return ret;
}

static int deletion(char *word, char **array, int start_idx)
{
    int i, word_len = strlen(word);

    for (i = 0; i < word_len; i++)
        array[i + start_idx] = concat(
                strndup(word, i),
                strndup(word+i+1, word_len-(i+1)),
                true
                );

    return i;
}

static int transposition(char *word, char **array, int start_idx)
{
    int i, word_len = strlen(word);

    for (i = 0; i < word_len-1; i++)
        array[i + start_idx] = concat(
                    concat(strndup(word, i), strndup(word+i+1, 1), true),
                    concat(strndup(word+i, 1),
                        strndup(word+i+2, word_len-(i+2)), true),
                    true
                    );

    return i;
}

static int alteration(char *word, char **array, int start_idx)
{
    int i, j, k, word_len = strlen(word);
    char c[2] = { 0, 0 };

    for (i = 0, k = 0; i < word_len; i++)
        for (j = 0; j < sizeof(alphabet); j++, k++) {
            c[0] = alphabet[j];
            array[start_idx + k] = concat(
                        concat(strndup(word+0, i),(char *) &c, false),
                        strndup(word+i+1, word_len - (i+1)),
                        true
                        );
        }

    return k;
}

static int insertion(char *word, char **array, int start_idx)
{
    int i, j, k, word_len = strlen(word);
    char c[2] = { 0, 0 };

    for (i = 0, k = 0; i <= word_len; i++)
        for (j = 0; j < sizeof(alphabet); j++, k++) {
            c[0] = alphabet[j];
            array[start_idx + k] = concat(
                        concat(strndup(word, i), (char *) &c, false),
                        strndup(word+i, word_len - i),
                        true
                        );
        }

    return k;
}

static int edits1_rows(char *word)
{
    register int size = strlen(word);

    return (size)                       + // deletion
        (size - 1)                      + // transposition
        (size * sizeof(alphabet))       + // alteration
        (size + 1) * sizeof(alphabet);    // insertion
}

static char **edits1(char *word)
{
    int next_idx;
    char **array = malloc(edits1_rows(word) * sizeof(char *));

    if (!array)
        return NULL;

    next_idx  = deletion(word, array, 0);
    next_idx += transposition(word, array, next_idx);
    next_idx += alteration(word, array, next_idx);
    insertion(word, array, next_idx);

    return array;
}

static int array_exist(char **array, int rows, char *word)
{
    int i;

    for (i = 0; i < rows; i++)
        if (!strcmp(array[i], word))
            return 1;

    return 0;
}

static char **known_edits2(char **array, int rows, int *e2_rows)
{
    int i, j, res_size, e1_rows;
    char **res = NULL, **e1;

    for (i = 0, res_size = 0; i < rows; i++) {
        e1      = edits1(array[i]);
        e1_rows = edits1_rows(array[i]);

        for (j = 0; j < e1_rows; j++)
            if (find(e1[j]) && !array_exist(res, res_size, e1[j])) {
                res             = realloc(res, sizeof(char *) * (res_size + 1));
                res[res_size++] = e1[j];
            }
    }

    *e2_rows = res_size;

    return res;
}

static char *max(char **array, int rows)
{
    char *max_word = NULL;
    int i, max_size = 0;
    ENTRY *e;

    for (i = 0; i < rows; i++) {
        e = find(array[i]);
        if (e && ((int) e->data > max_size)) {
            max_size = (int) e->data;
            max_word = e->key;
        }
    }

    return max_word;
}

static void array_cleanup(char **array, int rows)
{
    int i;

    for (i = 0; i < rows; i++)
        free(array[i]);
}

static char *correct(char *word)
{
    char **e1, **e2, *e1_word, *e2_word, *res_word = word;
    int e1_rows, e2_rows;

    if (find(word))
        return word;

    e1_rows = edits1_rows(word);
    if (e1_rows) {
        e1      = edits1(word);
        e1_word = max(e1, e1_rows);

        if (e1_word) {
            array_cleanup(e1, e1_rows);
            free(e1);
            return e1_word;
        }
    }

    e2 = known_edits2(e1, e1_rows, &e2_rows);
    if (e2_rows) {
        e2_word = max(e2, e2_rows);
        if (e2_word)
            res_word = e2_word;
    }

    array_cleanup(e1, e1_rows);
    array_cleanup(e2, e2_rows);

    free(e1);
    free(e2);

    return res_word;
}

int main(int argc, char **argv)
{
    char *corrected_word;
    ENTRY dict;

    hcreate(DICT_SZ);

    if (!read_file(dict))
        return -1;

    corrected_word = correct(argv[1]);
    if (strcmp(corrected_word, argv[1])) {
        printf("Did you mean \"%s\"?\n", corrected_word);
    } else {
        printf("\"%s\" is correct!\n", argv[1]);
    }

    hdestroy();
    return 0;
}

## spellcheck.py
#!/usr/bin/python
import re, collections

# from http://norvig.com/spell-correct.html
# bit.txt - http://norvig.com/big.txt

def words(text): return re.findall('[a-z]+', text.lower())

def train(features):
    model = collections.defaultdict(lambda: 1)
    for f in features:
        model[f] += 1
    return model

NWORDS = train(words(file('big.txt').read()))

alphabet = 'abcdefghijklmnopqrstuvwxyz'

def edits1(word):
   splits     = [(word[:i], word[i:]) for i in range(len(word) + 1)]
   deletes    = [a + b[1:] for a, b in splits if b]
   transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b)>1]
   replaces   = [a + c + b[1:] for a, b in splits for c in alphabet if b]
   inserts    = [a + c + b     for a, b in splits for c in alphabet]
   return set(deletes + transposes + replaces + inserts)

def known_edits2(word):
    return set(e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in NWORDS)

def known(words): return set(w for w in words if w in NWORDS)

def correct(word):
    candidates = known([word]) or known(edits1(word)) or known_edits2(word) or [word]
    return max(candidates, key=NWORDS.get)
	/*
	* spell.c --- spell corrector
	*
	* Copyright (C) 2007 Marcelo Toledo <marcelo@marcelotoledo.org>
	*
	* Version: 1.0
	* Keywords: spell corrector
	* Author: Marcelo Toledo <marcelo@marcelotoledo.org>
	* Maintainer: Marcelo Toledo <marcelo@marcelotoledo.org>
	* URL: http://www.marcelotoledo.org
	*
	* This program is free software; you can redistribute it and/or modify
	* it under the terms of the GNU General Public License as published by
	* the Free Software Foundation; either version 2 of the License, or
	* (at your option) any later version.
	*
	* This program is distributed in the hope that it will be useful,
	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	* GNU General Public License for more details.
	*
	* You should have received a copy of the GNU General Public License
	* along with this program; if not, write to the Free Software
	* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
	*
	* Commentary:
	*
	* See http://www.marcelotoledo.org.
	*
	* Code:
	*/

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <ctype.h>
	#include <search.h>
	#include <sys/types.h>
	#include <sys/stat.h>
	#include <unistd.h>
	#include <stdbool.h>

	#define DICTIONARY "./big.txt"
	#define DICT_SZ 3000000

	const char delim[] = ".,:;`/\"+-_(){}[]<>*&^%$#@!?~/\|\\=1234567890 \t\n";
	const char alphabet[] = "abcdefghijklmnopqrstuvwxyz";

	static char strtolower(char word)
	{
	char *s;

	for (s = word; *s; s++)
	s = tolower(s);

	return word;
	}

	static ENTRY find(char word)
	{
	ENTRY e;

	e.key = word;
	return hsearch(e, FIND);
	}

	static int update(char *word)
	{
	ENTRY *e = find(word);

	if (!e)
	return 0;

	e->data++;

	return 1;
	}

	static int read_file(ENTRY dict)
	{
	char file, word, *w;
	FILE *fp = fopen(DICTIONARY, "r");
	struct stat sb;

	if (!fp)
	return 0;

	if (stat(DICTIONARY, &sb))
	return 0;

	file = malloc(sb.st_size);
	if (!file) {
	fclose(fp);
	return 0;
	}

	fread(file, sizeof(char), sb.st_size, fp);

	word = strtok(file, delim);
	while(word != NULL) {
	w = strtolower(strdup(word));

	if (!update(w)) {
	dict.key = w;
	dict.data = 0;
	hsearch(dict, ENTER);
	}

	word = strtok(NULL, delim);
	}

	free(file);
	fclose(fp);

	return 1;
	}

	static char concat(char str1, char *str2, bool bfree)
	{
	char * ret;

	if (!str1) {
	str1 = malloc(sizeof(char));
	*str1 = '\0';
	}

	str1 = realloc(str1, strlen(str1) + strlen(str2) + 1);
	ret = strcat(str1, str2);
	if (str2 && bfree) {
	free(str2);
	}

	return ret;
	}

	static int deletion(char word, char *array, int start_idx)
	{
	int i, word_len = strlen(word);

	for (i = 0; i < word_len; i++)
	array[i + start_idx] = concat(
	strndup(word, i),
	strndup(word+i+1, word_len-(i+1)),
	true
	);

	return i;
	}

	static int transposition(char word, char *array, int start_idx)
	{
	int i, word_len = strlen(word);

	for (i = 0; i < word_len-1; i++)
	array[i + start_idx] = concat(
	concat(strndup(word, i), strndup(word+i+1, 1), true),
	concat(strndup(word+i, 1),
	strndup(word+i+2, word_len-(i+2)), true),
	true
	);

	return i;
	}

	static int alteration(char word, char *array, int start_idx)
	{
	int i, j, k, word_len = strlen(word);
	char c[2] = { 0, 0 };

	for (i = 0, k = 0; i < word_len; i++)
	for (j = 0; j < sizeof(alphabet); j++, k++) {
	c[0] = alphabet[j];
	array[start_idx + k] = concat(
	concat(strndup(word+0, i),(char *) &c, false),
	strndup(word+i+1, word_len - (i+1)),
	true
	);
	}

	return k;
	}

	static int insertion(char word, char *array, int start_idx)
	{
	int i, j, k, word_len = strlen(word);
	char c[2] = { 0, 0 };

	for (i = 0, k = 0; i <= word_len; i++)
	for (j = 0; j < sizeof(alphabet); j++, k++) {
	c[0] = alphabet[j];
	array[start_idx + k] = concat(
	concat(strndup(word, i), (char *) &c, false),
	strndup(word+i, word_len - i),
	true
	);
	}

	return k;
	}

	static int edits1_rows(char *word)
	{
	register int size = strlen(word);

	return (size) + // deletion
	(size - 1) + // transposition
	(size * sizeof(alphabet)) + // alteration
	(size + 1) * sizeof(alphabet); // insertion
	}

	static char *edits1(char word)
	{
	int next_idx;
	char *array = malloc(edits1_rows(word) sizeof(char *));

	if (!array)
	return NULL;

	next_idx = deletion(word, array, 0);
	next_idx += transposition(word, array, next_idx);
	next_idx += alteration(word, array, next_idx);
	insertion(word, array, next_idx);

	return array;
	}

	static int array_exist(char *array, int rows, char word)
	{
	int i;

	for (i = 0; i < rows; i++)
	if (!strcmp(array[i], word))
	return 1;

	return 0;
	}

	static char known_edits2(char array, int rows, int *e2_rows)
	{
	int i, j, res_size, e1_rows;
	char res = NULL, e1;

	for (i = 0, res_size = 0; i < rows; i++) {
	e1 = edits1(array[i]);
	e1_rows = edits1_rows(array[i]);

	for (j = 0; j < e1_rows; j++)
	if (find(e1[j]) && !array_exist(res, res_size, e1[j])) {
	res = realloc(res, sizeof(char ) (res_size + 1));
	res[res_size++] = e1[j];
	}
	}

	*e2_rows = res_size;

	return res;
	}

	static char max(char *array, int rows)
	{
	char *max_word = NULL;
	int i, max_size = 0;
	ENTRY *e;

	for (i = 0; i < rows; i++) {
	e = find(array[i]);
	if (e && ((int) e->data > max_size)) {
	max_size = (int) e->data;
	max_word = e->key;
	}
	}

	return max_word;
	}

	static void array_cleanup(char **array, int rows)
	{
	int i;

	for (i = 0; i < rows; i++)
	free(array[i]);
	}

	static char correct(char word)
	{
	char e1, e2, e1_word, e2_word, *res_word = word;
	int e1_rows, e2_rows;

	if (find(word))
	return word;

	e1_rows = edits1_rows(word);
	if (e1_rows) {
	e1 = edits1(word);
	e1_word = max(e1, e1_rows);

	if (e1_word) {
	array_cleanup(e1, e1_rows);
	free(e1);
	return e1_word;
	}
	}

	e2 = known_edits2(e1, e1_rows, &e2_rows);
	if (e2_rows) {
	e2_word = max(e2, e2_rows);
	if (e2_word)
	res_word = e2_word;
	}

	array_cleanup(e1, e1_rows);
	array_cleanup(e2, e2_rows);

	free(e1);
	free(e2);

	return res_word;
	}

	int main(int argc, char **argv)
	{
	char *corrected_word;
	ENTRY dict;

	hcreate(DICT_SZ);

	if (!read_file(dict))
	return -1;

	corrected_word = correct(argv[1]);
	if (strcmp(corrected_word, argv[1])) {
	printf("Did you mean \"%s\"?\n", corrected_word);
	} else {
	printf("\"%s\" is correct!\n", argv[1]);
	}

	hdestroy();
	return 0;
	}
	#!/usr/bin/python
	import re, collections

	# from http://norvig.com/spell-correct.html
	# bit.txt - http://norvig.com/big.txt

	def words(text): return re.findall('[a-z]+', text.lower())

	def train(features):
	model = collections.defaultdict(lambda: 1)
	for f in features:
	model[f] += 1
	return model

	NWORDS = train(words(file('big.txt').read()))

	alphabet = 'abcdefghijklmnopqrstuvwxyz'

	def edits1(word):
	splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
	deletes = [a + b[1:] for a, b in splits if b]
	transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b)>1]
	replaces = [a + c + b[1:] for a, b in splits for c in alphabet if b]
	inserts = [a + c + b for a, b in splits for c in alphabet]
	return set(deletes + transposes + replaces + inserts)

	def known_edits2(word):
	return set(e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in NWORDS)

	def known(words): return set(w for w in words if w in NWORDS)

	def correct(word):
	candidates = known([word]) or known(edits1(word)) or known_edits2(word) or [word]
	return max(candidates, key=NWORDS.get)