EmilHernvall/wordcount.c

## wordcount.c
// this has hacky support for swedish characters in utf8.
// i do not recommend this solution for any serious purposes. :)

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>

// convert a twobyte utf8 character into a single integer
#define MB(str) (((str[0] & 0xFF) << 8) | (str[1] & 0xFF))

// our linked list for words
typedef struct word_ {
	char *str;
	int c;
	struct word_ *next;
} word;

void *xmalloc(size_t size)
{
	void *res = malloc(size);
	if (res == NULL) {
		fprintf(stderr, "Memory allocation failed\n");
		abort();
	}
	return res;
}

void *xstrdup(char *str)
{
	char *res = strdup(str);
	if (res == NULL) {
		fprintf(stderr, "Memory allocation failed\n");
		abort();
	}
	return res;
}

// read a character from file, if it detects multibyte characters
// it merges them into a single integer
int readchar(FILE *fh)
{
	int c = fgetc(fh);
	if (c == EOF) {
		return c;
	}

	// all characters larger than 128 is unicode characters
	if (c > 0x80) {
		int c2 = fgetc(fh);
		if (c2 == EOF) {
			return c;
		}

		c = ((c & 0xFF) << 8) | (c2 & 0xFF);
	}

	return c;
}

// read a file from disk, and return a linked list
// containing all the words in the file
int get_words_ascii(word **words, char *path)
{
	FILE *fh = fopen(path, "r");
	if (!fh) {
		return -1;
	}

	int pos = 0, wc = 0;
	char buffer[1024];

	unsigned int c = fgetc(fh);
	word *current_word = NULL;
	do {
		// look for letters
		if ((c >= 'a' && c <= 'z') ||
				(c >= 'A' && c <= 'Z') ||
				c == 0x86 || c == 0x8F ||
				c == 0x84 || c == 0x8E ||
				c == 0x94 || c == 0x99
			) {
			buffer[pos] = c;
			pos++;
		}
		else if (pos > 0) {
			buffer[pos] = 0;

			word *new_word = (word*)xmalloc(sizeof(word));
			new_word->str = xstrdup(buffer);
			new_word->c = 1;
			new_word->next = current_word;

			current_word = new_word;

			memset(buffer, 0, sizeof(buffer));
			pos = 0;
			wc++;
		}
	} while ((c = fgetc(fh)) != EOF);

	fclose(fh);

	*words = current_word;

	return wc;
}

int get_words_utf8(word **words, char *path)
{
	FILE *fh = fopen(path, "r");
	if (!fh) {
		return -1;
	}

	int pos = 0, wc = 0;
	char buffer[1024];

	unsigned int c = readchar(fh);
	word *current_word = NULL;
	do {
		// look for letters
		if ((c >= 'a' && c <= 'z') ||
				(c >= 'A' && c <= 'Z') ||
				c == 0xC3A5 || c == 0xC385 ||
				c == 0xC3A4 || c == 0xC384 ||
				c == 0xC3B6 || c == 0xC396
			) {
			if (c > 0x80) {
				buffer[pos] = c >> 8;
				buffer[pos+1] = c & 0xFF;
				pos++;
			} else {
				buffer[pos] = c;
			}
			pos++;
		}
		else if (pos > 0) {
			buffer[pos] = 0;

			word *new_word = (word*)xmalloc(sizeof(word));
			new_word->str = xstrdup(buffer);
			new_word->c = 1;
			new_word->next = current_word;

			current_word = new_word;

			memset(buffer, 0, sizeof(buffer));
			pos = 0;
			wc++;
		}
	} while ((c = readchar(fh)) != EOF);

	fclose(fh);

	*words = current_word;

	return wc;
}

// free the memory for a linked list of words
void free_words(word *words)
{
		word *current_word;
		for (current_word = words; current_word != NULL; ) {
			word *next = current_word->next;
			free(current_word->str);
			free(current_word);
			current_word = next;
		}
}

// print a list of words. level can be used to specify
// indentation
void print_list(word *words, int level)
{
	word *current_word;
	for (current_word = words; current_word != NULL; current_word = current_word->next) {
		int j;
		for (j = 0; j < level; j++) {
			printf("\t");
		}
		printf("%s %d\n", current_word->str, current_word->c);
	}
}

// sort a list of words using a custom comparison function
word *merge_sort(word *words, int count, int level, int (*cmpfunc)(word*, word*))
{
	if (words == NULL) {
		return words;
	}

	//printf("%d\n", count);
	//print_list(words, level);

	if (count == 1) {
		words->next = NULL;
		return words;
	}

	// split the list into two parts
	int split = count / 2;

	word *left, *right;
	word *current_word;
	int i = 0;
	for (current_word = words; current_word != NULL; current_word = current_word->next) {
		if (i == split - 1 || current_word->next == NULL) {
			left = words;
			right = current_word->next;
			current_word->next = NULL;
			break;
		}
		i++;
	}

	// sort recursively
	word *base = merge_sort(left, split, level+1, cmpfunc);
	word *cmp = merge_sort(right, count-split, level+1, cmpfunc);

	//printf("left: %d\n", left->next == NULL);
	//printf("right: %d\n", right->next == NULL);

	// merge the two lists
	if (cmpfunc(base, cmp) > 0) {
		word *tmp = base;
		base = cmp;
		cmp = tmp;
	}

	word *result = base;
	while (cmp != NULL) {
		//printf("%s\n", cmp->str);
		if (base->next == NULL) {
			base->next = cmp;
			break;
		}

		//printf("cmp: %s %s\n", base->next->str, cmp->str);
		if (cmpfunc(base->next, cmp) > 0) {
			word *next = cmp->next;
			cmp->next = base->next;
			base->next = cmp;
			cmp = next;
		}
		else {
			base = base->next;
		}
	}

	//print_list(result, level);
	//printf("\n");

	return result;
}

// take a sorted linked list of words, and merge all duplicates
// incrementing the counter
int reduce(word *words)
{
	word *current_word, *next;
	int totalCount = 0;
	for (current_word = words; current_word != NULL; ) {
		for (next = current_word->next; next != NULL; ) {
			if (strcmp(current_word->str, next->str) != 0) {
				break;
			}
			current_word->c++;
			word *tmp = next->next;
			free(next->str);
			free(next);
			next = tmp;
		}

		totalCount++;
		current_word->next = next;
		current_word = next;
	}

	return totalCount;
}

int namecmp(word *a, word *b)
{
	return strcmp(b->str, a->str);
}

int countcmp(word *a, word *b)
{
	return b->c - a->c;
}

int main(int argc, char **argv)
{
	if (argc < 2) {
		printf("usage: wordcount [file]\n");
		return 0;
	}

	int i;
	for (i = 1; i < argc; i++) {
		word *words;

		// read words from file
		int wordCount = get_words_ascii(&words, argv[i]);
		if (wordCount == -1) {
			continue;
		}

		printf("wordcount for %s: %d\n", argv[i], wordCount);

		// first sorting pass: sort the words alphabetically
		words = merge_sort(words, wordCount, 0, namecmp);

		// eliminate duplicates
		int newCount = reduce(words);

		// sort by word frequency
		words = merge_sort(words, newCount, 0, countcmp);

		// print result
		word *current_word;
		int j = 0;
		for (current_word = words; current_word != NULL; current_word = current_word->next) {
			if (j > 10) {
				break;
			}

			printf("%s %d\n", current_word->str, current_word->c);
			j++;
		}

		free_words(words);

		printf("\n");
	}

	return 0;
}
	// this has hacky support for swedish characters in utf8.
	// i do not recommend this solution for any serious purposes. :)

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <wchar.h>

	// convert a twobyte utf8 character into a single integer
	#define MB(str) (((str[0] & 0xFF) << 8) \| (str[1] & 0xFF))

	// our linked list for words
	typedef struct word_ {
	char *str;
	int c;
	struct word_ *next;
	} word;

	void *xmalloc(size_t size)
	{
	void *res = malloc(size);
	if (res == NULL) {
	fprintf(stderr, "Memory allocation failed\n");
	abort();
	}
	return res;
	}

	void xstrdup(char str)
	{
	char *res = strdup(str);
	if (res == NULL) {
	fprintf(stderr, "Memory allocation failed\n");
	abort();
	}
	return res;
	}

	// read a character from file, if it detects multibyte characters
	// it merges them into a single integer
	int readchar(FILE *fh)
	{
	int c = fgetc(fh);
	if (c == EOF) {
	return c;
	}

	// all characters larger than 128 is unicode characters
	if (c > 0x80) {
	int c2 = fgetc(fh);
	if (c2 == EOF) {
	return c;
	}

	c = ((c & 0xFF) << 8) \| (c2 & 0xFF);
	}

	return c;
	}

	// read a file from disk, and return a linked list
	// containing all the words in the file
	int get_words_ascii(word *words, char path)
	{
	FILE *fh = fopen(path, "r");
	if (!fh) {
	return -1;
	}

	int pos = 0, wc = 0;
	char buffer[1024];

	unsigned int c = fgetc(fh);
	word *current_word = NULL;
	do {
	// look for letters
	if ((c >= 'a' && c <= 'z') \|\|
	(c >= 'A' && c <= 'Z') \|\|
	c == 0x86 \|\| c == 0x8F \|\|
	c == 0x84 \|\| c == 0x8E \|\|
	c == 0x94 \|\| c == 0x99
	) {
	buffer[pos] = c;
	pos++;
	}
	else if (pos > 0) {
	buffer[pos] = 0;

	word new_word = (word)xmalloc(sizeof(word));
	new_word->str = xstrdup(buffer);
	new_word->c = 1;
	new_word->next = current_word;

	current_word = new_word;

	memset(buffer, 0, sizeof(buffer));
	pos = 0;
	wc++;
	}
	} while ((c = fgetc(fh)) != EOF);

	fclose(fh);

	*words = current_word;

	return wc;
	}

	int get_words_utf8(word *words, char path)
	{
	FILE *fh = fopen(path, "r");
	if (!fh) {
	return -1;
	}

	int pos = 0, wc = 0;
	char buffer[1024];

	unsigned int c = readchar(fh);
	word *current_word = NULL;
	do {
	// look for letters
	if ((c >= 'a' && c <= 'z') \|\|
	(c >= 'A' && c <= 'Z') \|\|
	c == 0xC3A5 \|\| c == 0xC385 \|\|
	c == 0xC3A4 \|\| c == 0xC384 \|\|
	c == 0xC3B6 \|\| c == 0xC396
	) {
	if (c > 0x80) {
	buffer[pos] = c >> 8;
	buffer[pos+1] = c & 0xFF;
	pos++;
	} else {
	buffer[pos] = c;
	}
	pos++;
	}
	else if (pos > 0) {
	buffer[pos] = 0;

	word new_word = (word)xmalloc(sizeof(word));
	new_word->str = xstrdup(buffer);
	new_word->c = 1;
	new_word->next = current_word;

	current_word = new_word;

	memset(buffer, 0, sizeof(buffer));
	pos = 0;
	wc++;
	}
	} while ((c = readchar(fh)) != EOF);

	fclose(fh);

	*words = current_word;

	return wc;
	}

	// free the memory for a linked list of words
	void free_words(word *words)
	{
	word *current_word;
	for (current_word = words; current_word != NULL; ) {
	word *next = current_word->next;
	free(current_word->str);
	free(current_word);
	current_word = next;
	}
	}

	// print a list of words. level can be used to specify
	// indentation
	void print_list(word *words, int level)
	{
	word *current_word;
	for (current_word = words; current_word != NULL; current_word = current_word->next) {
	int j;
	for (j = 0; j < level; j++) {
	printf("\t");
	}
	printf("%s %d\n", current_word->str, current_word->c);
	}
	}

	// sort a list of words using a custom comparison function
	word merge_sort(word words, int count, int level, int (cmpfunc)(word, word*))
	{
	if (words == NULL) {
	return words;
	}

	//printf("%d\n", count);
	//print_list(words, level);

	if (count == 1) {
	words->next = NULL;
	return words;
	}

	// split the list into two parts
	int split = count / 2;

	word left, right;
	word *current_word;
	int i = 0;
	for (current_word = words; current_word != NULL; current_word = current_word->next) {
	if (i == split - 1 \|\| current_word->next == NULL) {
	left = words;
	right = current_word->next;
	current_word->next = NULL;
	break;
	}
	i++;
	}

	// sort recursively
	word *base = merge_sort(left, split, level+1, cmpfunc);
	word *cmp = merge_sort(right, count-split, level+1, cmpfunc);

	//printf("left: %d\n", left->next == NULL);
	//printf("right: %d\n", right->next == NULL);

	// merge the two lists
	if (cmpfunc(base, cmp) > 0) {
	word *tmp = base;
	base = cmp;
	cmp = tmp;
	}

	word *result = base;
	while (cmp != NULL) {
	//printf("%s\n", cmp->str);
	if (base->next == NULL) {
	base->next = cmp;
	break;
	}

	//printf("cmp: %s %s\n", base->next->str, cmp->str);
	if (cmpfunc(base->next, cmp) > 0) {
	word *next = cmp->next;
	cmp->next = base->next;
	base->next = cmp;
	cmp = next;
	}
	else {
	base = base->next;
	}
	}

	//print_list(result, level);
	//printf("\n");

	return result;
	}

	// take a sorted linked list of words, and merge all duplicates
	// incrementing the counter
	int reduce(word *words)
	{
	word current_word, next;
	int totalCount = 0;
	for (current_word = words; current_word != NULL; ) {
	for (next = current_word->next; next != NULL; ) {
	if (strcmp(current_word->str, next->str) != 0) {
	break;
	}
	current_word->c++;
	word *tmp = next->next;
	free(next->str);
	free(next);
	next = tmp;
	}

	totalCount++;
	current_word->next = next;
	current_word = next;
	}

	return totalCount;
	}

	int namecmp(word a, word b)
	{
	return strcmp(b->str, a->str);
	}

	int countcmp(word a, word b)
	{
	return b->c - a->c;
	}

	int main(int argc, char **argv)
	{
	if (argc < 2) {
	printf("usage: wordcount [file]\n");
	return 0;
	}

	int i;
	for (i = 1; i < argc; i++) {
	word *words;

	// read words from file
	int wordCount = get_words_ascii(&words, argv[i]);
	if (wordCount == -1) {
	continue;
	}

	printf("wordcount for %s: %d\n", argv[i], wordCount);

	// first sorting pass: sort the words alphabetically
	words = merge_sort(words, wordCount, 0, namecmp);

	// eliminate duplicates
	int newCount = reduce(words);

	// sort by word frequency
	words = merge_sort(words, newCount, 0, countcmp);

	// print result
	word *current_word;
	int j = 0;
	for (current_word = words; current_word != NULL; current_word = current_word->next) {
	if (j > 10) {
	break;
	}

	printf("%s %d\n", current_word->str, current_word->c);
	j++;
	}

	free_words(words);

	printf("\n");
	}

	return 0;
	}