Last active
June 13, 2021 19:57
-
-
Save charles-cooper/9e31d588dbcc15df81b96897dbfe7826 to your computer and use it in GitHub Desktop.
Naive version of wc which is 5x faster than the real wc. (Doesn't handle locales, utf-8, etc., just counts ASCII words).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Something I whipped up in half an hour.
// Depends on strcspn being super fast.
#define __USE_LARGEFILE64 | |
#define _LARGEFILE_SOURCE | |
#define _LARGEFILE64_SOURCE | |
#include <unistd.h> | |
#include <string.h> | |
#include <stdio.h> | |
#include <fcntl.h> | |
#include <sys/types.h> | |
#include <sys/stat.h> | |
#include <stdlib.h> | |
// Nominal page size in bytes (not referenced by the code visible in this file).
#define PAGE_SIZE 4096
// Number of bytes requested from read() per iteration: 64 KiB.
#define READ_SIZE (64 * 1024)
// Branch-prediction hints built on the GCC/Clang __builtin_expect builtin.
// Both evaluate to the value of (x); they only tell the compiler which
// outcome to expect.
#define likely(x)   __builtin_expect((x), 1)
#define unlikely(x) __builtin_expect((x), 0)
// Characters that separate words: exactly the bytes for which isspace() is
// true in the "C" locale (space, tab, newline, vertical tab, form feed,
// carriage return). Passed as the accept/reject set to strspn()/strcspn().
const char *WORD_SEPARATORS = " \t\n\v\f\r";
/*
 * min(a, b): the smaller of two values.
 *
 * Implemented as a GNU statement expression so that each argument is
 * evaluated exactly once (safe with side effects such as i++). When the
 * values compare equal, or a is not less than b, the result is b.
 */
#define min(a, b)                       \
    ({                                  \
        __typeof__(a) lhs_ = (a);       \
        __typeof__(b) rhs_ = (b);       \
        rhs_ > lhs_ ? lhs_ : rhs_;      \
    })
/*
 * Return the size in bytes of the file open on descriptor `fd`, as reported
 * by fstat64(), or -1 if fstat64() fails (errno is left as set by fstat64).
 *
 * Callers must check for a negative result before using the value as a
 * length.
 */
int64_t fsize(int fd) {
    // Named `info` rather than `stat` to avoid shadowing the stat() function.
    struct stat64 info;

    // Without this check a failed call would return an uninitialised st_size.
    if (fstat64(fd, &info) != 0) {
        return -1;
    }
    return info.st_size;
}
#define IN_TOKEN 0 | |
#define IN_SEPARATOR 1 | |
int wc_helper(char *buf, int64_t *ret, int st) { | |
while (*buf != 0) { | |
if (likely(st == IN_TOKEN)) { | |
// advance until we reach the end of a word, incrementing ret | |
// (strcspn apparently uses sse intrinsics these days) | |
size_t x = strcspn(buf, WORD_SEPARATORS); | |
buf += x; | |
if (*buf) { | |
(*ret)++; | |
st = IN_SEPARATOR; | |
} | |
} else { | |
// advance until we hit another word. | |
buf += strspn(buf, WORD_SEPARATORS); | |
if (*buf) { | |
st = IN_TOKEN; | |
} | |
} | |
} | |
return st; | |
} | |
int64_t wc(char *fname) { | |
int fd = open(fname, O_RDONLY); | |
int64_t len = fsize(fd); | |
char buf[READ_SIZE+1]; | |
int64_t left = len; | |
int64_t ret = 0; | |
int st = 0; // whether or not we are in a separator or a token. | |
while (1) { | |
ssize_t count = min(left, READ_SIZE); | |
ssize_t r = read(fd, buf, count); | |
if (r < 0) { | |
perror("panic"); | |
return -1; | |
} | |
if (r == 0) { | |
return ret; | |
} | |
left -= r; | |
// add a null terminator so that "str" operations terminate properly | |
buf[r] = '\0'; | |
st = wc_helper(buf, &ret, st); | |
} | |
return ret; | |
} | |
/*
 * Entry point: print the word count of the file named by argv[1].
 *
 * Exits with status 1 when no file argument is given. When wc() reports
 * failure (negative result), that value is returned as the exit status
 * without printing a count. Returns 0 on success.
 */
int main(int argc, char **argv) {
    if (argc < 2) {
        fprintf(stderr, "Must have at least one argument!");
        exit(1);
    }

    int64_t res = wc(argv[1]);
    if (res < 0) {
        // Same exit status as before; the narrowing is now explicit.
        return (int)res;
    }

    // int64_t is not `long` on every platform, so convert explicitly to a
    // type that has a portable printf specifier.
    printf("%lld\n", (long long)res);
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment