Skip to content

Instantly share code, notes, and snippets.

@a-square
Created October 21, 2019 00:15
Show Gist options
  • Save a-square/b8607f6bf5b6c40614ad100ccb2ae936 to your computer and use it in GitHub Desktop.
Save a-square/b8607f6bf5b6c40614ad100ccb2ae936 to your computer and use it in GitHub Desktop.
$ /usr/bin/time -l ./a.out test.txt
3000000 18000000 105000000 test.txt
0.17 real 0.14 user 0.03 sys
811008 maximum resident set size
0 average shared memory size
0 average unshared data size
0 average unshared stack size
207 page reclaims
0 page faults
0 swaps
0 block input operations
0 block output operations
0 messages sent
0 messages received
0 signals received
0 voluntary context switches
162 involuntary context switches
$ /usr/bin/time -l wc -mwl test.txt
3000000 18000000 105000000 test.txt
0.99 real 0.96 user 0.02 sys
1851392 maximum resident set size
0 average shared memory size
0 average unshared data size
0 average unshared stack size
462 page reclaims
1 page faults
0 swaps
0 block input operations
0 block output operations
0 messages sent
0 messages received
0 signals received
1 voluntary context switches
1335 involuntary context switches
// compile with: clang++ -std=c++17 -O3 wc.cpp
// Make sure that it's a recent Xcode's version of clang++!
#include <cstdio>
#include <fcntl.h>
#include <unistd.h>
constexpr size_t BUF_SIZE = 4 << 10; // 1 disk page, growing it up to L1 cache size doesn't help
constexpr size_t UNROLL_RATE = 8; // empirically the best for 2015-ish Intels
bool IsSpace(unsigned char c) {
// clang is actually REALLY good at optimizing these!
return c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\v' || c == '\f';
}
int main(int argc, const char* argv[]) {
if (argc < 2) {
return 1;
}
size_t num_chars = 0;
size_t num_words = 0;
size_t num_lines = 0;
bool prev_space = true;
// only real "hand-optimization" is not using iostream,
// which is an overengineered mess, or fopen/getchar b/c inlining
int fd = open(argv[1], O_RDONLY);
if (fd == -1) {
return 1;
}
char buffer[BUF_SIZE]; // alignas(UNROLL_RATE) doesn't help because no vectorization
ssize_t size = BUF_SIZE;
// clang++ doesn't seem to mind the references as long as the function is inlined
auto count_char = [&num_chars, &num_words, &num_lines, &prev_space](unsigned c) {
if (IsSpace(c)) {
++num_chars;
prev_space = true;
if (c == '\n') {
++num_lines;
}
} else {
// look at the top 2 bits to determine if a UTF-8 continuation byte
if ((0b1100'0000 & c) != 0b1000'0000) {
++num_chars;
}
num_words += prev_space;
prev_space = false;
}
};
while (size) {
// I take some issue with kernel -> userspace copying, but there is no portable way to avoid it.
// mmap might help but I'm too lazy to try it, it might also hurt b/c L1 cache
size = read(fd, buffer, BUF_SIZE);
if (size == -1) {
return 1;
}
size_t good_size = (size / UNROLL_RATE) * UNROLL_RATE;
for (const char* it = buffer; it < buffer + good_size; it += UNROLL_RATE) {
// extra loop tells the compiler it's okay to unroll aggressively
for (const char* itt = it; itt < it + UNROLL_RATE; ++itt) {
count_char(*itt);
}
}
// you can do away with one loop at the meager price of 17% runtime
for (const char* it = buffer + good_size; it < buffer + size; ++it) {
count_char(*it);
}
}
close(fd);
printf("%zu %zu %zu %s\n", num_lines, num_words, num_chars, argv[1]);
}
// make it 80 lines of sorta optimized C++
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment