Skip to content

Instantly share code, notes, and snippets.

@EmilHernvall
Created May 3, 2011 17:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save EmilHernvall/953744 to your computer and use it in GitHub Desktop.
Save EmilHernvall/953744 to your computer and use it in GitHub Desktop.
Find the most popular words in a file - Linked list and merge sort based version
// this has hacky support for swedish characters in utf8.
// i do not recommend this solution for any serious purposes. :)
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
// convert a twobyte utf8 character into a single integer
#define MB(str) (((str[0] & 0xFF) << 8) | (str[1] & 0xFF))
// our linked list for words
typedef struct word_ {
char *str;
int c;
struct word_ *next;
} word;
void *xmalloc(size_t size)
{
void *res = malloc(size);
if (res == NULL) {
fprintf(stderr, "Memory allocation failed\n");
abort();
}
return res;
}
void *xstrdup(char *str)
{
char *res = strdup(str);
if (res == NULL) {
fprintf(stderr, "Memory allocation failed\n");
abort();
}
return res;
}
// read a character from file, if it detects multibyte characters
// it merges them into a single integer
int readchar(FILE *fh)
{
int c = fgetc(fh);
if (c == EOF) {
return c;
}
// all characters larger than 128 is unicode characters
if (c > 0x80) {
int c2 = fgetc(fh);
if (c2 == EOF) {
return c;
}
c = ((c & 0xFF) << 8) | (c2 & 0xFF);
}
return c;
}
// read a file from disk, and return a linked list
// containing all the words in the file
int get_words_ascii(word **words, char *path)
{
FILE *fh = fopen(path, "r");
if (!fh) {
return -1;
}
int pos = 0, wc = 0;
char buffer[1024];
unsigned int c = fgetc(fh);
word *current_word = NULL;
do {
// look for letters
if ((c >= 'a' && c <= 'z') ||
(c >= 'A' && c <= 'Z') ||
c == 0x86 || c == 0x8F ||
c == 0x84 || c == 0x8E ||
c == 0x94 || c == 0x99
) {
buffer[pos] = c;
pos++;
}
else if (pos > 0) {
buffer[pos] = 0;
word *new_word = (word*)xmalloc(sizeof(word));
new_word->str = xstrdup(buffer);
new_word->c = 1;
new_word->next = current_word;
current_word = new_word;
memset(buffer, 0, sizeof(buffer));
pos = 0;
wc++;
}
} while ((c = fgetc(fh)) != EOF);
fclose(fh);
*words = current_word;
return wc;
}
int get_words_utf8(word **words, char *path)
{
FILE *fh = fopen(path, "r");
if (!fh) {
return -1;
}
int pos = 0, wc = 0;
char buffer[1024];
unsigned int c = readchar(fh);
word *current_word = NULL;
do {
// look for letters
if ((c >= 'a' && c <= 'z') ||
(c >= 'A' && c <= 'Z') ||
c == 0xC3A5 || c == 0xC385 ||
c == 0xC3A4 || c == 0xC384 ||
c == 0xC3B6 || c == 0xC396
) {
if (c > 0x80) {
buffer[pos] = c >> 8;
buffer[pos+1] = c & 0xFF;
pos++;
} else {
buffer[pos] = c;
}
pos++;
}
else if (pos > 0) {
buffer[pos] = 0;
word *new_word = (word*)xmalloc(sizeof(word));
new_word->str = xstrdup(buffer);
new_word->c = 1;
new_word->next = current_word;
current_word = new_word;
memset(buffer, 0, sizeof(buffer));
pos = 0;
wc++;
}
} while ((c = readchar(fh)) != EOF);
fclose(fh);
*words = current_word;
return wc;
}
// free the memory for a linked list of words
void free_words(word *words)
{
word *current_word;
for (current_word = words; current_word != NULL; ) {
word *next = current_word->next;
free(current_word->str);
free(current_word);
current_word = next;
}
}
// print a list of words. level can be used to specify
// indentation
void print_list(word *words, int level)
{
word *current_word;
for (current_word = words; current_word != NULL; current_word = current_word->next) {
int j;
for (j = 0; j < level; j++) {
printf("\t");
}
printf("%s %d\n", current_word->str, current_word->c);
}
}
// sort a list of words using a custom comparison function
word *merge_sort(word *words, int count, int level, int (*cmpfunc)(word*, word*))
{
if (words == NULL) {
return words;
}
//printf("%d\n", count);
//print_list(words, level);
if (count == 1) {
words->next = NULL;
return words;
}
// split the list into two parts
int split = count / 2;
word *left, *right;
word *current_word;
int i = 0;
for (current_word = words; current_word != NULL; current_word = current_word->next) {
if (i == split - 1 || current_word->next == NULL) {
left = words;
right = current_word->next;
current_word->next = NULL;
break;
}
i++;
}
// sort recursively
word *base = merge_sort(left, split, level+1, cmpfunc);
word *cmp = merge_sort(right, count-split, level+1, cmpfunc);
//printf("left: %d\n", left->next == NULL);
//printf("right: %d\n", right->next == NULL);
// merge the two lists
if (cmpfunc(base, cmp) > 0) {
word *tmp = base;
base = cmp;
cmp = tmp;
}
word *result = base;
while (cmp != NULL) {
//printf("%s\n", cmp->str);
if (base->next == NULL) {
base->next = cmp;
break;
}
//printf("cmp: %s %s\n", base->next->str, cmp->str);
if (cmpfunc(base->next, cmp) > 0) {
word *next = cmp->next;
cmp->next = base->next;
base->next = cmp;
cmp = next;
}
else {
base = base->next;
}
}
//print_list(result, level);
//printf("\n");
return result;
}
// take a sorted linked list of words, and merge all duplicates
// incrementing the counter
int reduce(word *words)
{
word *current_word, *next;
int totalCount = 0;
for (current_word = words; current_word != NULL; ) {
for (next = current_word->next; next != NULL; ) {
if (strcmp(current_word->str, next->str) != 0) {
break;
}
current_word->c++;
word *tmp = next->next;
free(next->str);
free(next);
next = tmp;
}
totalCount++;
current_word->next = next;
current_word = next;
}
return totalCount;
}
int namecmp(word *a, word *b)
{
return strcmp(b->str, a->str);
}
int countcmp(word *a, word *b)
{
return b->c - a->c;
}
int main(int argc, char **argv)
{
if (argc < 2) {
printf("usage: wordcount [file]\n");
return 0;
}
int i;
for (i = 1; i < argc; i++) {
word *words;
// read words from file
int wordCount = get_words_ascii(&words, argv[i]);
if (wordCount == -1) {
continue;
}
printf("wordcount for %s: %d\n", argv[i], wordCount);
// first sorting pass: sort the words alphabetically
words = merge_sort(words, wordCount, 0, namecmp);
// eliminate duplicates
int newCount = reduce(words);
// sort by word frequency
words = merge_sort(words, newCount, 0, countcmp);
// print result
word *current_word;
int j = 0;
for (current_word = words; current_word != NULL; current_word = current_word->next) {
if (j > 10) {
break;
}
printf("%s %d\n", current_word->str, current_word->c);
j++;
}
free_words(words);
printf("\n");
}
return 0;
}
@EmilHernvall
Copy link
Author

Turns out this isn't nearly as efficient as the hashtable approach in https://gist.github.com/953745 . In fact, a perl script performed slightly better than this implementation.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment