Skip to content

Instantly share code, notes, and snippets.

@charles-cooper
Last active June 13, 2021 19:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save charles-cooper/9e31d588dbcc15df81b96897dbfe7826 to your computer and use it in GitHub Desktop.
Save charles-cooper/9e31d588dbcc15df81b96897dbfe7826 to your computer and use it in GitHub Desktop.
Naive version of wc which is 5x faster than the real wc. (Doesn't handle locales, utf-8, etc., just counts ASCII words).
// Something I whipped up in half an hour.
// Depends on strcspn being super fast.
#define __USE_LARGEFILE64
#define _LARGEFILE_SOURCE
#define _LARGEFILE64_SOURCE
#define _FILE_OFFSET_BITS 64
#include <sys/types.h>
#include <sys/stat.h>
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#define PAGE_SIZE 4096
// Number of bytes requested from read() per iteration (64 KiB).
#define READ_SIZE (16384*4)
// Branch-prediction hints; rely on the GCC/Clang __builtin_expect builtin.
#define likely(x) __builtin_expect((x),1)
#define unlikely(x) __builtin_expect((x),0)
// Bytes that end a word: the six characters isspace() accepts in the "C"
// locale (space, \t, \n, \v, \f, \r). The pointer targets a string literal,
// so it is const-qualified to prevent accidental writes through it.
const char *WORD_SEPARATORS = " \t\n\v\f\r";
// Type-generic minimum that evaluates each argument exactly once
// (GNU statement expression + __typeof__).
#define min(a,b) \
({ __typeof__ (a) _a = (a); \
__typeof__ (b) _b = (b); \
_a < _b ? _a : _b; })
/*
 * Return the size in bytes of the file open on descriptor fd, or -1 if the
 * descriptor cannot be stat'ed (errno is left as set by fstat()).
 *
 * Uses plain fstat()/struct stat rather than the transitional fstat64()
 * interface; the file defines _FILE_OFFSET_BITS as 64 before its includes so
 * st_size is 64 bits wide on 32-bit targets as well.
 */
int64_t fsize(int fd) {
    struct stat sb;

    /* Never read sb after a failed call: its contents are indeterminate. */
    if (fstat(fd, &sb) != 0) {
        return -1;
    }
    return (int64_t)sb.st_size;
}
/* Scanner states carried from one chunk to the next. */
#define IN_TOKEN 0
#define IN_SEPARATOR 1
/*
 * Scan one NUL-terminated chunk of text and count completed words.
 *
 * buf  chunk to scan; scanning stops at the first '\0' byte, so anything
 *      after an embedded NUL is not examined.
 * ret  running word total; incremented each time a word is followed by a
 *      separator inside this chunk.
 * st   state to start in (IN_TOKEN or IN_SEPARATOR; any value other than
 *      IN_TOKEN is handled as IN_SEPARATOR).
 *
 * Returns the state at the end of the chunk so the caller can resume with
 * the next chunk. A word that runs up to the terminating NUL is not counted
 * here: it is counted once a later chunk supplies the separator that ends it.
 */
int wc_helper(char *buf, int64_t *ret, int st) {
    char *cur = buf;

    for (;;) {
        if (*cur == '\0') {
            return st;
        }

        if (unlikely(st != IN_TOKEN)) {
            /* Skip the separator run; only switch state if a word follows. */
            cur += strspn(cur, WORD_SEPARATORS);
            if (*cur != '\0') {
                st = IN_TOKEN;
            }
            continue;
        }

        /* Inside a word: jump to its end (strcspn does the heavy lifting). */
        cur += strcspn(cur, WORD_SEPARATORS);
        if (*cur != '\0') {
            /* Stopped on a separator rather than the chunk end: word done. */
            *ret += 1;
            st = IN_SEPARATOR;
        }
    }
}
int64_t wc(char *fname) {
int fd = open(fname, O_RDONLY);
int64_t len = fsize(fd);
char buf[READ_SIZE+1];
int64_t left = len;
int64_t ret = 0;
int st = 0; // whether or not we are in a separator or a token.
while (1) {
ssize_t count = min(left, READ_SIZE);
ssize_t r = read(fd, buf, count);
if (r < 0) {
perror("panic");
return -1;
}
if (r == 0) {
return ret;
}
left -= r;
// add a null terminator so that "str" operations terminate properly
buf[r] = '\0';
st = wc_helper(buf, &ret, st);
}
return ret;
}
/*
 * Entry point: print the word count of the file named by argv[1].
 *
 * Exits with status 1 when no file argument is given, EXIT_FAILURE when
 * wc() reports an error (wc() has already printed the diagnostic), and 0
 * after printing the count followed by a newline.
 */
int main(int argc, char **argv) {
    if (argc < 2) {
        fprintf(stderr, "Must have at least one argument!\n");
        exit(1);
    }

    char *fname = argv[1];
    int64_t res = wc(fname);
    if (res < 0) {
        /* Do not pass the int64_t error value through as the exit status. */
        return EXIT_FAILURE;
    }

    /* PRId64 matches int64_t on every ABI; "%ld" is wrong where long is 32 bits. */
    printf("%" PRId64 "\n", res);
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment