Created
May 3, 2011 17:13
-
-
Save EmilHernvall/953745 to your computer and use it in GitHub Desktop.
Find the most popular words in a file - Hashtable based version
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
typedef struct word_ { | |
char *str; | |
int c; | |
struct word_ *next; | |
} word; | |
typedef struct bucket_ { | |
char **key; | |
int *count; | |
int size; | |
} bucket; | |
typedef struct hashset_ { | |
int size; | |
int count; | |
int collisions; | |
double loadfactor; | |
bucket *buckets; | |
} hashset; | |
typedef struct hashset_iterator_ { | |
hashset *hs; | |
int init; | |
int bucket; | |
int pos; | |
} hashset_iterator; | |
unsigned int hashset_keyfunc(char *key); | |
void hashset_init(hashset *hs, int size, double loadfactor); | |
void hashset_rehash(hashset *hs); | |
void hashset_insert(hashset *hs, char *key, int initCount, int copyKey); | |
hashset_iterator *hashset_newiterator(hashset *hs); | |
int hashset_iterate(hashset_iterator *it, char **key); | |
void hashset_freeiterator(hashset_iterator *it); | |
void hashset_free(hashset *hs, int freeKeys); | |
unsigned int hashset_keyfunc(char *key) | |
{ | |
unsigned int hash = 17; | |
while (*key) { | |
hash = 31 * hash + *key; | |
key++; | |
} | |
return hash; | |
} | |
void hashset_init(hashset *hs, int size, double loadfactor) | |
{ | |
hs->size = size; | |
hs->loadfactor = loadfactor; | |
hs->count = 0; | |
hs->collisions = 0; | |
hs->buckets = (bucket*)malloc(sizeof(bucket) * size); | |
memset(hs->buckets, 0, sizeof(bucket) * size); | |
} | |
void hashset_rehash(hashset *hs) | |
{ | |
int newSize = hs->size * 2; | |
hashset newHs; | |
hashset_init(&newHs, newSize, hs->loadfactor); | |
hashset_iterator *it = hashset_newiterator(hs); | |
char *key; | |
int count; | |
while ((count = hashset_iterate(it, &key)) != -1) { | |
hashset_insert(&newHs, key, count, 0); | |
} | |
hashset_freeiterator(it); | |
free(hs->buckets); | |
memcpy(hs, &newHs, sizeof(hashset)); | |
} | |
void hashset_insert(hashset *hs, char *key, int initCount, int copyKey) | |
{ | |
if (hs->count > hs->size * hs->loadfactor) { | |
//printf("Rehashing\n"); | |
hashset_rehash(hs); | |
} | |
unsigned int hash = hashset_keyfunc(key); | |
unsigned int pos = (1001 + hash) % hs->size; | |
bucket *bucket = &hs->buckets[pos]; | |
int i; | |
for (i = 0; i < bucket->size; i++) { | |
if (strcmp(key, bucket->key[i]) == 0) { | |
bucket->count[i]++; | |
return; | |
} | |
} | |
if (bucket->size >= 1) { | |
hs->collisions++; | |
} | |
hs->count++; | |
int newSize = ++bucket->size; | |
bucket->key = (char**)realloc(bucket->key, sizeof(char*) * newSize); | |
bucket->count = (int*)realloc(bucket->count, sizeof(int*) * newSize); | |
bucket->key[newSize-1] = copyKey ? strdup(key) : key; | |
bucket->count[newSize-1] = initCount; | |
} | |
hashset_iterator *hashset_newiterator(hashset *hs) | |
{ | |
hashset_iterator *it = (hashset_iterator*)malloc(sizeof(hashset_iterator)); | |
it->hs = hs; | |
it->init = 1; | |
it->bucket = 0; | |
it->pos = 0; | |
return it; | |
} | |
int hashset_iterate(hashset_iterator *it, char **key) | |
{ | |
hashset *hs = it->hs; | |
int i, j; | |
for (i = it->bucket; i < hs->size; i++) { | |
bucket *bucket = &hs->buckets[i]; | |
for (j = it->pos; j < bucket->size; j++) { | |
it->bucket = i; | |
it->pos = j + 1; | |
*key = bucket->key[j]; | |
return bucket->count[j]; | |
} | |
it->pos = 0; | |
} | |
return -1; | |
} | |
void hashset_freeiterator(hashset_iterator *it) | |
{ | |
free(it); | |
} | |
void hashset_free(hashset *hs, int freeKeys) | |
{ | |
int i, j; | |
for (i = 0; i < hs->size; i++) { | |
bucket *bucket = &hs->buckets[i]; | |
if (freeKeys) { | |
for (j = 0; j < bucket->size; j++) { | |
free(bucket->key[j]); | |
} | |
} | |
free(bucket->count); | |
} | |
free(hs->buckets); | |
} | |
word *merge_sort(word *words, int count, int level, int (*cmpfunc)(word*, word*)) | |
{ | |
if (words == NULL) { | |
return words; | |
} | |
//printf("%d\n", count); | |
//print_list(words, level); | |
if (count == 1) { | |
words->next = NULL; | |
return words; | |
} | |
// split the list into two parts | |
int split = count / 2; | |
word *left, *right; | |
word *current_word; | |
int i = 0; | |
for (current_word = words; current_word != NULL; current_word = current_word->next) { | |
if (i == split - 1 || current_word->next == NULL) { | |
left = words; | |
right = current_word->next; | |
current_word->next = NULL; | |
break; | |
} | |
i++; | |
} | |
// sort recursively | |
word *base = merge_sort(left, split, level+1, cmpfunc); | |
word *cmp = merge_sort(right, count-split, level+1, cmpfunc); | |
//printf("left: %d\n", left->next == NULL); | |
//printf("right: %d\n", right->next == NULL); | |
// merge the two lists | |
if (cmpfunc(base, cmp) > 0) { | |
word *tmp = base; | |
base = cmp; | |
cmp = tmp; | |
} | |
word *result = base; | |
while (cmp != NULL) { | |
//printf("%s\n", cmp->str); | |
if (base->next == NULL) { | |
base->next = cmp; | |
break; | |
} | |
//printf("cmp: %s %s\n", base->next->str, cmp->str); | |
if (cmpfunc(base->next, cmp) > 0) { | |
word *next = cmp->next; | |
cmp->next = base->next; | |
base->next = cmp; | |
cmp = next; | |
} | |
else { | |
base = base->next; | |
} | |
} | |
//print_list(result, level); | |
//printf("\n"); | |
return result; | |
} | |
int countcmp(word *a, word *b) | |
{ | |
return b->c - a->c; | |
} | |
// free the memory for a linked list of words | |
void free_words(word *words) | |
{ | |
word *current_word; | |
for (current_word = words; current_word != NULL; ) { | |
word *next = current_word->next; | |
free(current_word->str); | |
free(current_word); | |
current_word = next; | |
} | |
} | |
void read_file(char *path, hashset *hs) | |
{ | |
FILE *fh = fopen(path, "r"); | |
if (fh == NULL) { | |
return; | |
} | |
int pos = 0, wc = 0; | |
char buffer[1024]; | |
int c = fgetc(fh); | |
do { | |
if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) { | |
buffer[pos] = c; | |
pos++; | |
} | |
else if (pos > 0) { | |
buffer[pos] = '\0'; | |
//printf("%s\n", buffer); | |
hashset_insert(hs, buffer, 1, 1); | |
memset(buffer, 0, sizeof(buffer)); | |
wc++; | |
pos = 0; | |
} | |
} while ((c = fgetc(fh)) != EOF); | |
printf("wc: %d\n", wc); | |
fclose(fh); | |
} | |
int main(int argc, char* argv[]) | |
{ | |
if (argc < 2) { | |
printf("usage: %s [file]\n", argv[0]); | |
return 0; | |
} | |
hashset hs; | |
hashset_init(&hs, 1001, 0.75); | |
char *path = argv[1]; | |
read_file(path, &hs); | |
hashset_iterator *it = hashset_newiterator(&hs); | |
char *key; | |
int count; | |
int unique = 0, total = 0; | |
word *words = NULL; | |
while ((count = hashset_iterate(it, &key)) != -1) { | |
//printf("%d %s\n", count, key); | |
word *newWord = (word*)malloc(sizeof(word)); | |
newWord->next = words; | |
newWord->str = key; | |
newWord->c = count; | |
words = newWord; | |
unique++; | |
total += count; | |
} | |
//printf("unique: %d, total: %d\n", unique, total); | |
//printf("collissions: %d\n", hs.collisions); | |
hashset_freeiterator(it); | |
hashset_free(&hs, 0); | |
words = merge_sort(words, unique, 0, countcmp); | |
// print result | |
word *current_word; | |
int j = 0; | |
for (current_word = words; current_word != NULL; current_word = current_word->next) { | |
if (j > 10) { | |
break; | |
} | |
printf("%s %d\n", current_word->str, current_word->c); | |
j++; | |
} | |
free_words(words); | |
return 0; | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment