Created
March 3, 2020 08:48
-
-
Save kirugan/405a7d908e6b097ffc0962aec37ef3bc to your computer and use it in GitHub Desktop.
Fast frequency dictionary
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <fcntl.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
/*
 * One entry of the frequency dictionary.
 * NOTE(review): nothing in this file uses the struct yet; the field meanings
 * below are inferred from their names and should be confirmed.
 */
typedef struct freq {
    int freq;   /* presumably the number of occurrences of `word` */
    char *word; /* presumably points at the NUL-terminated word text */
} freq;
/*
 * Return true when `c` is an uppercase ASCII letter ('A'..'Z').
 * Lowercase letters and every other byte yield false.
 *
 * Declared `static inline`: a plain `inline` definition in C99/C11 does not
 * emit an external definition, so a call that the compiler chooses not to
 * inline (e.g. at -O0) would fail to link.
 */
static inline bool is_letter(char c) {
    return c >= 'A' && c <= 'Z';
}
int main(int argc, char** argv) { | |
if (argc < 2) { | |
fprintf(stderr, "Wrong usage"); | |
return 1; | |
} | |
struct stat st; | |
int fd = open(argv[1], O_RDONLY); | |
fstat(fd, &st); | |
char* first_addr = mmap(NULL, st.st_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); | |
char* last_addr = first_addr + st.st_size; | |
madvise(first_addr, st.st_size, MADV_SEQUENTIAL); | |
//void* words = malloc(sizeof(freq) * 1000); | |
char* addr = first_addr; | |
char c; | |
char* word_ptr = NULL; | |
for (; addr <= last_addr;++addr) { | |
c = *addr; | |
if (c > 'a') { | |
c -= ('a' - 'A'); | |
*addr = c; | |
} | |
continue; | |
if (is_letter(c)){ | |
if (word_ptr == NULL) { | |
word_ptr = addr; | |
} | |
} else { | |
if (word_ptr != NULL) { | |
// what about last word in memory? if there wont be any garbage at the end | |
*addr = '\0'; | |
word_ptr = NULL; | |
} | |
} | |
} | |
printf("last addr: %p\n", addr); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment