Skip to content

Instantly share code, notes, and snippets.

@dylan-evans
Created February 4, 2012 14:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dylan-evans/1738281 to your computer and use it in GitHub Desktop.
Save dylan-evans/1738281 to your computer and use it in GitHub Desktop.
Simple hash
/** \file simple_hash.h
*
* This simple hashing routine was created after finding the hsearch function
* a bit inadequate, due to the lack of a reentrant function in posix and
* the inability to do simple operations such as delete... and for fun.
*/
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <malloc.h>
#include "simple_hash.h"
/**
* Allocate a table object and set the hash function.
*
* \param size The number of entries in the hash.
* \param hash_func The hash function. This can be set to null for sh_indexer.
*
* \return Newly allocated hash table.
*/
sh_table *sh_create(int size, int (*hash_func)(char *))
{
sh_table *tab = malloc(sizeof(sh_table));
tab->size = size;
tab->table = (sh_bucket *)malloc(sizeof(sh_bucket) * size);
memset(tab->table, '\0', sizeof(sh_bucket) * size);
if(hash_func)
tab->hash_func = hash_func;
else
tab->hash_func = &sh_indexer;
return tab;
}
/**
* A very simple hash routine for use in testing. Actually this has provided
* a pretty resonable spread for the assorted texts i have been running through
* it.
*
* \param str The string to be hashed.
* \return The hash key.
*/
int sh_indexer(char *str)
{
int hash, i;
for(hash = 0, i = 0; *str; str++, i++) {
hash ^= ((int)(*str) << (i % 32));
}
return hash;
}
/**
* Find an entry. If no matching key is found then the entry and index is
* set to the position of the next empty bucket, of course this rule is
* dependent on there always being an empty bucket which requires a new
* sh_bucket to be allocated as soon as all the buckets are in a sh_bucket
* are filled.
*
* \param tab The table object.
* \param key The key string.
* \param ent A location to store the matching entry.
* \param index The matching index.
*
* \return 1 when an entry is found.
*/
inline int sh_find_entry(sh_table *tab, char *key, sh_bucket **ent, int *index)
{
int hkey;
char *lkey, *rkey;
hkey = (*tab->hash_func)(key) % tab->size;
for(*ent = &tab->table[hkey]; *ent; *ent = (*ent)->next) {
for((*index) = 0; (*index) < SH_MAX_BUCKET; (*index)++) {
lkey = (*ent)->key[*index];
if(!lkey) return 0;
for(rkey = key; *rkey == *lkey; rkey++, lkey++)
if(!*rkey) return 1;
}
}
return 0;
}
/**
* Add a new value to the hash table. This function does nothing if en entry
* with a matching key already exists.
*
* \param tab The hash table to add to.
* \param key The key to add.
* \param val The value to associate with the key.
*
* \return 1 on success, 0 if the key already exists.
* \sa sh_set sh_replace
*/
int sh_add(sh_table *tab, char *key, void *val)
{
int index = 0;
sh_bucket *ent;
if(sh_find_entry(tab, key, &ent, &index))
return 0; // Already exists
ent->key[index] = key;
ent->value[index] = val;
if(index == (SH_MAX_BUCKET - 1)) {
ent->next = (sh_bucket*)malloc(sizeof(sh_bucket));
memset(ent->next, '\0', sizeof(sh_bucket));
ent->next->prev = ent;
}
return 1;
}
/**
* This function adds an item to the hash, adding it if
* it doesn't already exist.
*
* \param tab The hash table.
* \param key The key to add.
* \param val The value to be set.
*
* \return Always returns 1
* \sa sh_add sh_replace
*/
int sh_set(sh_table *tab, char *key, void *val)
{
sh_bucket *ent;
int index = 0;
sh_find_entry(tab, key, &ent, &index);
ent->key[index] = (char*) malloc(strlen(key));
strcpy(ent->key[index], key);
ent->value[index] = val;
if(index == (SH_MAX_BUCKET - 1) && !ent->next) {
ent->next = (sh_bucket*)malloc(sizeof(sh_bucket));
}
return 1;
}
/**
* Replace an existing item.
*
* \param tab The table object.
* \param key The key string.
* \param val The pointer to store.
*
* \return 1 on success, 0 if the item doesn't exist.
*/
int sh_replace(sh_table *tab, char *key, void *val)
{
int index;
sh_bucket *ent;
if(!sh_find_entry(tab, key, &ent, &index))
return 0; // It doesn't exist
ent->value[index] = val;
return 1;
}
/**
* Delete a matching item.
*
* \param tab The sh_table to check.
* \param key The key to search for.
*
* \return 0 if no match is found.
*/
int sh_delete(sh_table *tab, char *key)
{
int index = 0, n_index = 0;
sh_bucket *ent, *n_ent;
if(!sh_find_entry(tab, key, &ent, &index))
return 0;
// This routine shifts each entry into the place of the higher bucket
n_ent = ent;
while(ent && ent->key[index]) {
n_index = index + 1;
if(n_index == SH_MAX_BUCKET) {
n_index = 0;
n_ent = ent->next;
}
ent->key[index] = n_ent->key[n_index];
ent->value[index] = n_ent->value[n_index];
ent = n_ent;
index = n_index;
}
if(n_index == SH_MAX_BUCKET)
free(ent->next);
n_ent->key[n_index] = NULL;
n_ent->value[n_index] = NULL;
return 1;
}
/**
* Find an item in the hash.
*
* \param tab The sh_table instance to search.
* \param key The key to search for.
*
* \return The entry value or NULL if no item is found.
*/
void *sh_find(sh_table *tab, char *key)
{
int index;
sh_bucket *ent;
// It would be better if you just didn't do this
if(!tab || !key) return NULL;
if(!sh_find_entry(tab, key, &ent, &index))
return NULL;
return ent->value[index];
}
/**
* Send each key/value pair to a callback function. Although in many ways this
* beats the purpose of using a hash it is very useful to do a brute force
* scan of a hash. This function scans through each item in whatever position
* it happens to be stored without any attempt at sorting.
*
* The callback has the paramaters (key, value, arg),
* \warning The function MUST return a positive value to continue scanning.
* Returning a zero value will abort the scan and return.
*
* \warning If an item is deleted in a scan the next item in an entry will be
* missed if it exists, if it is the last item in an entry then it will likely
* result in a segfault. Just don't do it.
*
* \param tab The table to scan.
* \param arg An argument passed to the callback function, may be NULL.
* \param cb The callback function
*
* \return True if the scan completed, zero if cancelled.
*/
int sh_each(sh_table *tab, void *arg, int (*cb)(char *, void *, void *))
{
int i, e;
sh_bucket *ent;
for(i = 0; i < tab->size; i++) {
ent = &tab->table[i];
while(ent && ent->key[0]) {
for(e = 0; e < SH_MAX_BUCKET; e++) {
if(!ent->key[e]) break; // End of the bucket
if(!(*cb)(ent->key[e], ent->value[e], tab))
return 0;
}
ent = ent->next;
}
}
return 1;
}
#ifndef SIMPLE_HASH_H
#define SIMPLE_HASH_H 1
#define SH_MAX_BUCKET 9
typedef struct _sh_bucket sh_bucket;
struct _sh_bucket {
char *key[SH_MAX_BUCKET];
void *value[SH_MAX_BUCKET];
sh_bucket *prev, *next;
};
typedef struct _sh_table sh_table;
struct _sh_table {
unsigned int size;
sh_bucket *table;
int (*hash_func)(char *);
};
sh_table *sh_create(int, int (*)(char*));
int sh_indexer(char *);
int sh_add(sh_table *, char *, void *);
int sh_set(sh_table *, char *, void *);
int sh_delete(sh_table *, char *);
void *sh_find(sh_table *, char *);
int sh_each(sh_table *, void *, int (*)(char *, void *, void *));
#endif
#include <stdlib.h>
#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <unistd.h>
#include "simple_hash.h"
// This is probably going to be different on some systems
#define SYSTEM_DICT "/usr/share/dict/words"
/** \file spellcheck.c
*
* This is a basic spell checker which loads data from the system dictionary
* into a simple hash table and scans stdin for words not matching any entries.
*/
int test_word(sh_table *tab, char *buf, int *col);
int read_dict(FILE *f, char *buf);
int main()
{
int len, line, col;
int word_count, err_count;
char buf[256];
FILE *f;
sh_table *tab;
/* With a bit of experimentation i found 8192 to give a pretty even spread
* with the default hash function */
tab = sh_create(8192, NULL);
f = fopen(SYSTEM_DICT, "r");
if(!f) {
perror("Couldn't open dictionary");
exit(1);
}
while(!read_dict(f, buf))
sh_set(tab, buf, (void*)1);
fclose(f);
err_count = word_count = 0;
line = col = 0; // The position in the file
while(!feof(stdin) && fgets(buf, 256, stdin)) {
line++;
// Each line
for(col = 0; buf[col]; col++) {
// Each character
if(isalpha(buf[col])) {
if( (len = test_word(tab, buf, &col)) ) {
err_count++;
printf("Bad word: '%.*s' (line: %d, column: %d)\n",
len, (buf + (col - len)), line, (col - len));
printf("%s%*s", buf, (col - len), "");
while(len--) putchar('^');
putchar('\n');
}
word_count++;
}
}
}
printf("%d words found with %d errors on %d lines\n",
word_count, err_count, line);
return err_count;
}
/**
* Read a word into the dictionary converting to lower case.
*
* \param f The input file.
* \param buf The input buffer to use.
*
* \return The end of file status.
*/
int read_dict(FILE *f, char *buf)
{
int i;
fgets(buf, 256, f);
for(i = 0; buf[i] != '\0'; i++) {
if(buf[i] == '\n')
buf[i] = '\0';
else if(isupper(buf[i]))
buf[i] += 32;
}
return feof(f);
}
/**
* Parse and test the spelling of a single word.
*
* \param tab The sh_table with dictionary in it.
* \param buf The input buffer.
* \param col A pointer to the column index.
*
* \return 0 if the word is found, otherwise the length of the word.
*/
int test_word(sh_table *tab, char *buf, int *col)
{
char word[256];
int wi;
buf += (*col);
for(wi = 0; wi < 256 && (isalpha(*buf) || *buf == '\''); buf++, wi++)
word[wi] = tolower(*buf);
if(word[wi-1] == '\'') wi--;
word[wi] = '\0';
(*col) += wi ;
if(sh_find(tab, word)) return 0;
return wi;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment