EmilHernvall/wordcount2.c

## wordcount2.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct word_ {
	char *str;
	int c;
	struct word_ *next;
} word;

typedef struct bucket_ {
	char **key;
	int *count;
	int size;
} bucket;

typedef struct hashset_ {
	int size;
	int count;
	int collisions;
	double loadfactor;
	bucket *buckets;
} hashset;

typedef struct hashset_iterator_ {
	hashset *hs;
	int init;
	int bucket;
	int pos;
} hashset_iterator;

unsigned int hashset_keyfunc(char *key);
void hashset_init(hashset *hs, int size, double loadfactor);
void hashset_rehash(hashset *hs);
void hashset_insert(hashset *hs, char *key, int initCount, int copyKey);
hashset_iterator *hashset_newiterator(hashset *hs);
int hashset_iterate(hashset_iterator *it, char **key);
void hashset_freeiterator(hashset_iterator *it);
void hashset_free(hashset *hs, int freeKeys);

unsigned int hashset_keyfunc(char *key)
{
	unsigned int hash = 17;
	while (*key) {
		hash = 31 * hash + *key;
		key++;
	}

	return hash;
}

void hashset_init(hashset *hs, int size, double loadfactor)
{
	hs->size = size;
	hs->loadfactor = loadfactor;
	hs->count = 0;
	hs->collisions = 0;
	hs->buckets = (bucket*)malloc(sizeof(bucket) * size);
	memset(hs->buckets, 0, sizeof(bucket) * size);
}

void hashset_rehash(hashset *hs)
{
	int newSize = hs->size * 2;

	hashset newHs;
	hashset_init(&newHs, newSize, hs->loadfactor);

	hashset_iterator *it = hashset_newiterator(hs);

	char *key;
	int count;
	while ((count = hashset_iterate(it, &key)) != -1) {
		hashset_insert(&newHs, key, count, 0);
	}

	hashset_freeiterator(it);

	free(hs->buckets);

	memcpy(hs, &newHs, sizeof(hashset));
}

void hashset_insert(hashset *hs, char *key, int initCount, int copyKey)
{
	if (hs->count > hs->size * hs->loadfactor) {
		//printf("Rehashing\n");
		hashset_rehash(hs);
	}

	unsigned int hash = hashset_keyfunc(key);
	unsigned int pos = (1001 + hash) % hs->size;

	bucket *bucket = &hs->buckets[pos];

	int i;
	for (i = 0; i < bucket->size; i++) {
		if (strcmp(key, bucket->key[i]) == 0) {
			bucket->count[i]++;
			return;
		}
	}

	if (bucket->size >= 1) {
		hs->collisions++;
	}

	hs->count++;

	int newSize = ++bucket->size;
	bucket->key = (char**)realloc(bucket->key, sizeof(char*) * newSize);
	bucket->count = (int*)realloc(bucket->count, sizeof(int*) * newSize);

	bucket->key[newSize-1] = copyKey ? strdup(key) : key;
	bucket->count[newSize-1] = initCount;
}

hashset_iterator *hashset_newiterator(hashset *hs)
{
	hashset_iterator *it = (hashset_iterator*)malloc(sizeof(hashset_iterator));
	it->hs = hs;
	it->init = 1;
	it->bucket = 0;
	it->pos = 0;

	return it;
}

int hashset_iterate(hashset_iterator *it, char **key)
{
	hashset *hs = it->hs;

	int i, j;
	for (i = it->bucket; i < hs->size; i++) {
		bucket *bucket = &hs->buckets[i];
		for (j = it->pos; j < bucket->size; j++) {
			it->bucket = i;
			it->pos = j + 1;
			*key = bucket->key[j];
			return bucket->count[j];
		}

		it->pos = 0;
	}

	return -1;
}

void hashset_freeiterator(hashset_iterator *it)
{
	free(it);
}

void hashset_free(hashset *hs, int freeKeys)
{
	int i, j;
	for (i = 0; i < hs->size; i++) {
		bucket *bucket = &hs->buckets[i];
		if (freeKeys) {
			for (j = 0; j < bucket->size; j++) {
				free(bucket->key[j]);
			}
		}
		free(bucket->count);
	}

	free(hs->buckets);
}

word *merge_sort(word *words, int count, int level, int (*cmpfunc)(word*, word*))
{
	if (words == NULL) {
		return words;
	}

	//printf("%d\n", count);
	//print_list(words, level);

	if (count == 1) {
		words->next = NULL;
		return words;
	}

	// split the list into two parts
	int split = count / 2;

	word *left, *right;
	word *current_word;
	int i = 0;
	for (current_word = words; current_word != NULL; current_word = current_word->next) {
		if (i == split - 1 || current_word->next == NULL) {
			left = words;
			right = current_word->next;
			current_word->next = NULL;
			break;
		}
		i++;
	}

	// sort recursively
	word *base = merge_sort(left, split, level+1, cmpfunc);
	word *cmp = merge_sort(right, count-split, level+1, cmpfunc);

	//printf("left: %d\n", left->next == NULL);
	//printf("right: %d\n", right->next == NULL);

	// merge the two lists
	if (cmpfunc(base, cmp) > 0) {
		word *tmp = base;
		base = cmp;
		cmp = tmp;
	}

	word *result = base;
	while (cmp != NULL) {
		//printf("%s\n", cmp->str);
		if (base->next == NULL) {
			base->next = cmp;
			break;
		}

		//printf("cmp: %s %s\n", base->next->str, cmp->str);
		if (cmpfunc(base->next, cmp) > 0) {
			word *next = cmp->next;
			cmp->next = base->next;
			base->next = cmp;
			cmp = next;
		}
		else {
			base = base->next;
		}
	}

	//print_list(result, level);
	//printf("\n");

	return result;
}

int countcmp(word *a, word *b)
{
	return b->c - a->c;
}

// free the memory for a linked list of words
void free_words(word *words)
{
		word *current_word;
		for (current_word = words; current_word != NULL; ) {
			word *next = current_word->next;
			free(current_word->str);
			free(current_word);
			current_word = next;
		}
}

void read_file(char *path, hashset *hs)
{
	FILE *fh = fopen(path, "r");
	if (fh == NULL) {
		return;
	}

	int pos = 0, wc = 0;
	char buffer[1024];

	int c = fgetc(fh);
	do {
		if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) {
			buffer[pos] = c;
			pos++;
		}
		else if (pos > 0) {
			buffer[pos] = '\0';
			//printf("%s\n", buffer);

			hashset_insert(hs, buffer, 1, 1);

			memset(buffer, 0, sizeof(buffer));

			wc++;
			pos = 0;
		}
	} while ((c = fgetc(fh)) != EOF);

	printf("wc: %d\n", wc);

	fclose(fh);
}

int main(int argc, char* argv[])
{
	if (argc < 2) {
		printf("usage: %s [file]\n", argv[0]);
		return 0;
	}

	hashset hs;
	hashset_init(&hs, 1001, 0.75);

	char *path = argv[1];
	read_file(path, &hs);

	hashset_iterator *it = hashset_newiterator(&hs);

	char *key;
	int count;
	int unique = 0, total = 0;
	word *words = NULL;
	while ((count = hashset_iterate(it, &key)) != -1) {
		//printf("%d %s\n", count, key);

		word *newWord = (word*)malloc(sizeof(word));
		newWord->next = words;
		newWord->str = key;
		newWord->c = count;
		words = newWord;

		unique++;
		total += count;
	}

	//printf("unique: %d, total: %d\n", unique, total);
	//printf("collissions: %d\n", hs.collisions);

	hashset_freeiterator(it);

	hashset_free(&hs, 0);

	words = merge_sort(words, unique, 0, countcmp);

	// print result
	word *current_word;
	int j = 0;
	for (current_word = words; current_word != NULL; current_word = current_word->next) {
		if (j > 10) {
			break;
		}

		printf("%s %d\n", current_word->str, current_word->c);
		j++;
	}

	free_words(words);

	return 0;
}
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	typedef struct word_ {
	char *str;
	int c;
	struct word_ *next;
	} word;

	typedef struct bucket_ {
	char **key;
	int *count;
	int size;
	} bucket;

	typedef struct hashset_ {
	int size;
	int count;
	int collisions;
	double loadfactor;
	bucket *buckets;
	} hashset;

	typedef struct hashset_iterator_ {
	hashset *hs;
	int init;
	int bucket;
	int pos;
	} hashset_iterator;

	unsigned int hashset_keyfunc(char *key);
	void hashset_init(hashset *hs, int size, double loadfactor);
	void hashset_rehash(hashset *hs);
	void hashset_insert(hashset hs, char key, int initCount, int copyKey);
	hashset_iterator hashset_newiterator(hashset hs);
	int hashset_iterate(hashset_iterator it, char *key);
	void hashset_freeiterator(hashset_iterator *it);
	void hashset_free(hashset *hs, int freeKeys);

	unsigned int hashset_keyfunc(char *key)
	{
	unsigned int hash = 17;
	while (*key) {
	hash = 31 * hash + *key;
	key++;
	}

	return hash;
	}

	void hashset_init(hashset *hs, int size, double loadfactor)
	{
	hs->size = size;
	hs->loadfactor = loadfactor;
	hs->count = 0;
	hs->collisions = 0;
	hs->buckets = (bucket)malloc(sizeof(bucket) size);
	memset(hs->buckets, 0, sizeof(bucket) * size);
	}

	void hashset_rehash(hashset *hs)
	{
	int newSize = hs->size * 2;

	hashset newHs;
	hashset_init(&newHs, newSize, hs->loadfactor);

	hashset_iterator *it = hashset_newiterator(hs);

	char *key;
	int count;
	while ((count = hashset_iterate(it, &key)) != -1) {
	hashset_insert(&newHs, key, count, 0);
	}

	hashset_freeiterator(it);

	free(hs->buckets);

	memcpy(hs, &newHs, sizeof(hashset));
	}

	void hashset_insert(hashset hs, char key, int initCount, int copyKey)
	{
	if (hs->count > hs->size * hs->loadfactor) {
	//printf("Rehashing\n");
	hashset_rehash(hs);
	}

	unsigned int hash = hashset_keyfunc(key);
	unsigned int pos = (1001 + hash) % hs->size;

	bucket *bucket = &hs->buckets[pos];

	int i;
	for (i = 0; i < bucket->size; i++) {
	if (strcmp(key, bucket->key[i]) == 0) {
	bucket->count[i]++;
	return;
	}
	}

	if (bucket->size >= 1) {
	hs->collisions++;
	}

	hs->count++;

	int newSize = ++bucket->size;
	bucket->key = (char*)realloc(bucket->key, sizeof(char) * newSize);
	bucket->count = (int)realloc(bucket->count, sizeof(int) * newSize);

	bucket->key[newSize-1] = copyKey ? strdup(key) : key;
	bucket->count[newSize-1] = initCount;
	}

	hashset_iterator hashset_newiterator(hashset hs)
	{
	hashset_iterator it = (hashset_iterator)malloc(sizeof(hashset_iterator));
	it->hs = hs;
	it->init = 1;
	it->bucket = 0;
	it->pos = 0;

	return it;
	}

	int hashset_iterate(hashset_iterator it, char *key)
	{
	hashset *hs = it->hs;

	int i, j;
	for (i = it->bucket; i < hs->size; i++) {
	bucket *bucket = &hs->buckets[i];
	for (j = it->pos; j < bucket->size; j++) {
	it->bucket = i;
	it->pos = j + 1;
	*key = bucket->key[j];
	return bucket->count[j];
	}

	it->pos = 0;
	}

	return -1;
	}

	void hashset_freeiterator(hashset_iterator *it)
	{
	free(it);
	}

	void hashset_free(hashset *hs, int freeKeys)
	{
	int i, j;
	for (i = 0; i < hs->size; i++) {
	bucket *bucket = &hs->buckets[i];
	if (freeKeys) {
	for (j = 0; j < bucket->size; j++) {
	free(bucket->key[j]);
	}
	}
	free(bucket->count);
	}

	free(hs->buckets);
	}

	word merge_sort(word words, int count, int level, int (cmpfunc)(word, word*))
	{
	if (words == NULL) {
	return words;
	}

	//printf("%d\n", count);
	//print_list(words, level);

	if (count == 1) {
	words->next = NULL;
	return words;
	}

	// split the list into two parts
	int split = count / 2;

	word left, right;
	word *current_word;
	int i = 0;
	for (current_word = words; current_word != NULL; current_word = current_word->next) {
	if (i == split - 1 \|\| current_word->next == NULL) {
	left = words;
	right = current_word->next;
	current_word->next = NULL;
	break;
	}
	i++;
	}

	// sort recursively
	word *base = merge_sort(left, split, level+1, cmpfunc);
	word *cmp = merge_sort(right, count-split, level+1, cmpfunc);

	//printf("left: %d\n", left->next == NULL);
	//printf("right: %d\n", right->next == NULL);

	// merge the two lists
	if (cmpfunc(base, cmp) > 0) {
	word *tmp = base;
	base = cmp;
	cmp = tmp;
	}

	word *result = base;
	while (cmp != NULL) {
	//printf("%s\n", cmp->str);
	if (base->next == NULL) {
	base->next = cmp;
	break;
	}

	//printf("cmp: %s %s\n", base->next->str, cmp->str);
	if (cmpfunc(base->next, cmp) > 0) {
	word *next = cmp->next;
	cmp->next = base->next;
	base->next = cmp;
	cmp = next;
	}
	else {
	base = base->next;
	}
	}

	//print_list(result, level);
	//printf("\n");

	return result;
	}

	int countcmp(word a, word b)
	{
	return b->c - a->c;
	}

	// free the memory for a linked list of words
	void free_words(word *words)
	{
	word *current_word;
	for (current_word = words; current_word != NULL; ) {
	word *next = current_word->next;
	free(current_word->str);
	free(current_word);
	current_word = next;
	}
	}

	void read_file(char path, hashset hs)
	{
	FILE *fh = fopen(path, "r");
	if (fh == NULL) {
	return;
	}

	int pos = 0, wc = 0;
	char buffer[1024];

	int c = fgetc(fh);
	do {
	if ((c >= 'a' && c <= 'z') \|\| (c >= 'A' && c <= 'Z')) {
	buffer[pos] = c;
	pos++;
	}
	else if (pos > 0) {
	buffer[pos] = '\0';
	//printf("%s\n", buffer);

	hashset_insert(hs, buffer, 1, 1);

	memset(buffer, 0, sizeof(buffer));

	wc++;
	pos = 0;
	}
	} while ((c = fgetc(fh)) != EOF);

	printf("wc: %d\n", wc);

	fclose(fh);
	}

	int main(int argc, char* argv[])
	{
	if (argc < 2) {
	printf("usage: %s [file]\n", argv[0]);
	return 0;
	}

	hashset hs;
	hashset_init(&hs, 1001, 0.75);

	char *path = argv[1];
	read_file(path, &hs);

	hashset_iterator *it = hashset_newiterator(&hs);

	char *key;
	int count;
	int unique = 0, total = 0;
	word *words = NULL;
	while ((count = hashset_iterate(it, &key)) != -1) {
	//printf("%d %s\n", count, key);

	word newWord = (word)malloc(sizeof(word));
	newWord->next = words;
	newWord->str = key;
	newWord->c = count;
	words = newWord;

	unique++;
	total += count;
	}

	//printf("unique: %d, total: %d\n", unique, total);
	//printf("collissions: %d\n", hs.collisions);

	hashset_freeiterator(it);

	hashset_free(&hs, 0);

	words = merge_sort(words, unique, 0, countcmp);

	// print result
	word *current_word;
	int j = 0;
	for (current_word = words; current_word != NULL; current_word = current_word->next) {
	if (j > 10) {
	break;
	}

	printf("%s %d\n", current_word->str, current_word->c);
	j++;
	}

	free_words(words);

	return 0;
	}