Skip to content

Instantly share code, notes, and snippets.

@EmilHernvall
Created May 3, 2011 17:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save EmilHernvall/953745 to your computer and use it in GitHub Desktop.
Find the most popular words in a file - Hashtable based version
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Linked-list node pairing a word with its occurrence count; used to
// collect and sort the table's contents in main().
typedef struct word_ {
char *str; // word text (freed by free_words, so the list owns it)
int c; // occurrence count
struct word_ *next;
} word;
// One hash slot: parallel growable arrays of keys and their counts
// (separate chaining without per-entry link nodes).
typedef struct bucket_ {
char **key; // key[i] is the i-th key stored in this slot
int *count; // count[i] is the count for key[i]
int size; // number of entries currently in this slot
} bucket;
// Hash table mapping C strings to int counts.
typedef struct hashset_ {
int size; // number of buckets
int count; // total distinct keys stored
int collisions; // inserts that landed in an already-occupied bucket
double loadfactor; // count/size ratio that triggers a rehash on insert
bucket *buckets;
} hashset;
// Resumable cursor over every (key, count) pair in a hashset.
typedef struct hashset_iterator_ {
hashset *hs;
int init; // set at creation; not read anywhere in this file
int bucket; // bucket index to resume from
int pos; // slot within that bucket to resume from
} hashset_iterator;
// --- forward declarations ---
unsigned int hashset_keyfunc(char *key);
void hashset_init(hashset *hs, int size, double loadfactor);
void hashset_rehash(hashset *hs);
void hashset_insert(hashset *hs, char *key, int initCount, int copyKey);
hashset_iterator *hashset_newiterator(hashset *hs);
int hashset_iterate(hashset_iterator *it, char **key);
void hashset_freeiterator(hashset_iterator *it);
void hashset_free(hashset *hs, int freeKeys);
/* Polynomial string hash: h = 17, then h = 31*h + c for each byte of key.
 * key must be a NUL-terminated string. */
unsigned int hashset_keyfunc(char *key)
{
    unsigned int h = 17;
    for (const char *p = key; *p != '\0'; p++) {
        h = h * 31 + *p;
    }
    return h;
}
/* Initialize hs with `size` empty buckets.
 * loadfactor is the count/size ratio at which insertion rehashes.
 * Aborts the process if the bucket array cannot be allocated
 * (callers never check for a failed init). */
void hashset_init(hashset *hs, int size, double loadfactor)
{
    hs->size = size;
    hs->loadfactor = loadfactor;
    hs->count = 0;
    hs->collisions = 0;
    /* calloc zero-fills (replacing malloc+memset) and checks the
       size * sizeof(bucket) multiplication for overflow internally */
    hs->buckets = (bucket*)calloc(size, sizeof(bucket));
    if (hs->buckets == NULL) {
        fprintf(stderr, "hashset_init: out of memory (%d buckets)\n", size);
        exit(1);
    }
}
/* Double the table size and redistribute every entry.
 * Key strings are moved (inserted with copyKey=0), not copied, so the
 * new table takes over ownership of the existing pointers. */
void hashset_rehash(hashset *hs)
{
    int newSize = hs->size * 2;
    hashset newHs;
    hashset_init(&newHs, newSize, hs->loadfactor);

    hashset_iterator *it = hashset_newiterator(hs);
    char *key;
    int count;
    while ((count = hashset_iterate(it, &key)) != -1) {
        hashset_insert(&newHs, key, count, 0);
    }
    hashset_freeiterator(it);

    /* Release the old table. The per-bucket key/count arrays were
       previously leaked here — only hs->buckets itself was freed.
       The key strings are NOT freed: newHs now owns them. */
    int i;
    for (i = 0; i < hs->size; i++) {
        free(hs->buckets[i].key);
        free(hs->buckets[i].count);
    }
    free(hs->buckets);

    memcpy(hs, &newHs, sizeof(hashset));
}
/* Add `key` to the table with count `initCount`, or add `initCount` to
 * its existing count if already present.
 * copyKey != 0: the table stores a strdup'd copy of key.
 * copyKey == 0: the table takes the caller's pointer as-is (used by
 * hashset_rehash to move keys without copying). */
void hashset_insert(hashset *hs, char *key, int initCount, int copyKey)
{
    /* grow before inserting once the load factor is exceeded */
    if (hs->count > hs->size * hs->loadfactor) {
        hashset_rehash(hs);
    }

    unsigned int hash = hashset_keyfunc(key);
    unsigned int pos = (1001 + hash) % hs->size;
    bucket *b = &hs->buckets[pos]; /* renamed: previously shadowed the `bucket` type */

    /* existing key: merge the counts */
    int i;
    for (i = 0; i < b->size; i++) {
        if (strcmp(key, b->key[i]) == 0) {
            /* was `++`, which ignored initCount; += keeps the
               initCount==1 callers identical and makes merges correct */
            b->count[i] += initCount;
            return;
        }
    }

    if (b->size >= 1) {
        hs->collisions++;
    }
    hs->count++;

    /* NOTE(review): realloc results are unchecked, matching the rest of
       the file's abort-on-OOM-by-crash behavior. */
    int newSize = ++b->size;
    b->key = (char**)realloc(b->key, sizeof(char*) * newSize);
    b->count = (int*)realloc(b->count, sizeof(int) * newSize); /* was sizeof(int*) */
    b->key[newSize-1] = copyKey ? strdup(key) : key;
    b->count[newSize-1] = initCount;
}
/* Allocate an iterator positioned at the start of hs.
 * Caller releases it with hashset_freeiterator(). */
hashset_iterator *hashset_newiterator(hashset *hs)
{
    hashset_iterator *iter = (hashset_iterator*)malloc(sizeof(hashset_iterator));
    iter->pos = 0;
    iter->bucket = 0;
    iter->init = 1;
    iter->hs = hs;
    return iter;
}
/* Return the count of the next entry and store its key in *key.
 * Returns -1 when the table is exhausted.
 * Resumes from the position saved in the iterator, so repeated calls
 * walk every (key, count) pair exactly once. */
int hashset_iterate(hashset_iterator *it, char **key)
{
hashset *hs = it->hs;
int i, j;
// resume scanning at the saved bucket index
for (i = it->bucket; i < hs->size; i++) {
bucket *bucket = &hs->buckets[i];
// resume within the bucket at the saved slot
for (j = it->pos; j < bucket->size; j++) {
// record the position just past this entry before returning
it->bucket = i;
it->pos = j + 1;
*key = bucket->key[j];
return bucket->count[j];
}
// bucket exhausted: later buckets must start at slot 0
it->pos = 0;
}
return -1;
}
/* Dispose of an iterator created by hashset_newiterator().
 * The underlying hashset is untouched. */
void hashset_freeiterator(hashset_iterator *it)
{
free(it);
}
/* Release all memory held by the table.
 * freeKeys != 0: also free every stored key string (for tables that
 * own their keys via copyKey=1 inserts).
 * freeKeys == 0: leave key strings alive for whoever took them over. */
void hashset_free(hashset *hs, int freeKeys)
{
    int i, j;
    for (i = 0; i < hs->size; i++) {
        bucket *b = &hs->buckets[i];
        if (freeKeys) {
            for (j = 0; j < b->size; j++) {
                free(b->key[j]);
            }
        }
        free(b->key);   /* was leaked: only b->count was freed before */
        free(b->count);
    }
    free(hs->buckets);
    hs->buckets = NULL; /* guard against accidental reuse after free */
    hs->size = 0;
    hs->count = 0;
}
/* Recursive merge sort over a singly linked list of `count` nodes.
 * cmpfunc returns > 0 when its first argument should sort AFTER its
 * second (qsort convention). `level` is only used by the commented-out
 * debug prints. Returns the new head; the list is relinked in place. */
word *merge_sort(word *words, int count, int level, int (*cmpfunc)(word*, word*))
{
if (words == NULL) {
return words;
}
//printf("%d\n", count);
//print_list(words, level);
if (count == 1) {
// terminate a single node so merged sublists stay well-formed
words->next = NULL;
return words;
}
// split the list into two parts
int split = count / 2;
word *left, *right;
word *current_word;
int i = 0;
// Walk to node split-1 and cut the list there. For count >= 2 the break
// always fires (at the tail at the latest), so left/right are always set.
for (current_word = words; current_word != NULL; current_word = current_word->next) {
if (i == split - 1 || current_word->next == NULL) {
left = words;
right = current_word->next;
current_word->next = NULL;
break;
}
i++;
}
// sort recursively
word *base = merge_sort(left, split, level+1, cmpfunc);
word *cmp = merge_sort(right, count-split, level+1, cmpfunc);
//printf("left: %d\n", left->next == NULL);
//printf("right: %d\n", right->next == NULL);
// merge the two lists
// ensure base starts with the smaller head; it becomes the result list
if (cmpfunc(base, cmp) > 0) {
word *tmp = base;
base = cmp;
cmp = tmp;
}
word *result = base;
// splice nodes from cmp into base wherever they sort before base->next
while (cmp != NULL) {
//printf("%s\n", cmp->str);
if (base->next == NULL) {
// base exhausted: append the remainder of cmp and stop
base->next = cmp;
break;
}
//printf("cmp: %s %s\n", base->next->str, cmp->str);
if (cmpfunc(base->next, cmp) > 0) {
// cmp's head belongs before base->next: link it in
word *next = cmp->next;
cmp->next = base->next;
base->next = cmp;
cmp = next;
}
else {
base = base->next;
}
}
//print_list(result, level);
//printf("\n");
return result;
}
/* Comparator for merge_sort: orders words by DESCENDING count.
 * Uses comparisons instead of `b->c - a->c` so the result cannot
 * overflow int for extreme counts; the sign is identical to the
 * original subtraction for all non-overflowing inputs. */
int countcmp(word *a, word *b)
{
    return (b->c > a->c) - (b->c < a->c);
}
// free the memory for a linked list of words
void free_words(word *words)
{
word *current_word;
for (current_word = words; current_word != NULL; ) {
word *next = current_word->next;
free(current_word->str);
free(current_word);
current_word = next;
}
}
/* Tokenize the file at `path` into runs of ASCII letters and count each
 * word in `hs` (keys are copied by the table). Prints the total word
 * count. Words of 1023+ letters are truncated to fit the buffer.
 * Returns silently if the file cannot be opened. */
void read_file(char *path, hashset *hs)
{
    FILE *fh = fopen(path, "r");
    if (fh == NULL) {
        return;
    }
    char buffer[1024];
    int pos = 0, wc = 0;
    int c;
    while ((c = fgetc(fh)) != EOF) {
        if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) {
            /* bounds check (was missing): leave room for the NUL so an
               overlong word can no longer overflow the buffer */
            if (pos < (int)sizeof(buffer) - 1) {
                buffer[pos++] = (char)c;
            }
        }
        else if (pos > 0) {
            buffer[pos] = '\0';
            hashset_insert(hs, buffer, 1, 1);
            wc++;
            pos = 0;
        }
    }
    /* flush a trailing word when the file ends mid-word (was dropped) */
    if (pos > 0) {
        buffer[pos] = '\0';
        hashset_insert(hs, buffer, 1, 1);
        wc++;
    }
    printf("wc: %d\n", wc);
    fclose(fh);
}
int main(int argc, char* argv[])
{
if (argc < 2) {
printf("usage: %s [file]\n", argv[0]);
return 0;
}
hashset hs;
hashset_init(&hs, 1001, 0.75);
char *path = argv[1];
read_file(path, &hs);
hashset_iterator *it = hashset_newiterator(&hs);
char *key;
int count;
int unique = 0, total = 0;
word *words = NULL;
while ((count = hashset_iterate(it, &key)) != -1) {
//printf("%d %s\n", count, key);
word *newWord = (word*)malloc(sizeof(word));
newWord->next = words;
newWord->str = key;
newWord->c = count;
words = newWord;
unique++;
total += count;
}
//printf("unique: %d, total: %d\n", unique, total);
//printf("collissions: %d\n", hs.collisions);
hashset_freeiterator(it);
hashset_free(&hs, 0);
words = merge_sort(words, unique, 0, countcmp);
// print result
word *current_word;
int j = 0;
for (current_word = words; current_word != NULL; current_word = current_word->next) {
if (j > 10) {
break;
}
printf("%s %d\n", current_word->str, current_word->c);
j++;
}
free_words(words);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment