Last active
March 3, 2020 15:27
-
-
Save erthink/a93124374bb7cb8aa9dc731ed2cfdf73 to your computer and use it in GitHub Desktop.
Simple portable optimized ASCII-only version of wc util
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* zlib License, see https://en.wikipedia.org/wiki/Zlib_License | |
* | |
* Copyright (c) 2020 Leonid Yuriev <leo@yuriev.ru>, | |
* Simple portable optimized ASCII-only version of wc tool. | |
* Just for fun & https://en.wikipedia.org/wiki/The_Computer_Language_Benchmarks_Game | |
* | |
* This software is provided 'as-is', without any express or implied | |
* warranty. In no event will the authors be held liable for any damages | |
* arising from the use of this software. | |
* | |
* Permission is granted to anyone to use this software for any purpose, | |
* including commercial applications, and to alter it and redistribute it | |
* freely, subject to the following restrictions: | |
* | |
* 1. The origin of this software must not be misrepresented; you must not | |
* claim that you wrote the original software. If you use this software | |
* in a product, an acknowledgement in the product documentation would be | |
* appreciated but is not required. | |
* 2. Altered source versions must be plainly marked as such, and must not be | |
* misrepresented as being the original software. | |
* 3. This notice may not be removed or altered from any source distribution. | |
* | |
*/ | |
#include <errno.h> | |
#include <fcntl.h> | |
#include <stddef.h> | |
#include <stdint.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <sys/mman.h> | |
#include <time.h> | |
#include <unistd.h> | |
static struct { size_t chars, words, lines; } result; | |
typedef struct { | |
uint16_t m[4]; | |
} hyper_t; | |
static hyper_t sym2hyper[65536]; | |
static uint8_t hyper2value[512]; | |
static void build_hyper(void) { | |
for (unsigned i = 0; i < 65536; ++i) { | |
unsigned char c[2]; | |
memcpy(c, &i, 2); | |
const unsigned ln = (c[0] == '\n') + (c[1] == '\n'); | |
const _Bool w0 = (c[0] != ' ') && (c[0] - 9 > 4); | |
const _Bool w1 = (c[1] != ' ') && (c[1] - 9 > 4); | |
sym2hyper[i].m[0] = ln + (w0 << 5) + (w1 << 6); | |
sym2hyper[i].m[1] = ln + (w0 << 7) + (w1 << 8); | |
sym2hyper[i].m[2] = ln + (w0 << 9) + (w1 << 10); | |
sym2hyper[i].m[3] = ln + (w0 << 11) + (w1 << 12); | |
} | |
for (unsigned i = 0; i < 512; ++i) { | |
_Bool prev = i & 1; | |
for (unsigned mask = i >> 1; mask > 0; mask >>= 1) { | |
hyper2value[i] += (mask & 1) && !prev; | |
prev = mask & 1; | |
} | |
} | |
} | |
static unsigned map(const unsigned char *text, size_t i, size_t n) { | |
return sym2hyper[*(uint16_t *)(text + i + n * 2)].m[n]; | |
} | |
static unsigned reduce(unsigned prev, unsigned hyper_a, unsigned hyper_b, | |
unsigned hyper_c, unsigned hyper_d) { | |
return prev + hyper_a + hyper_b + hyper_c + hyper_d; | |
} | |
static unsigned apply(unsigned hyper) { | |
result.lines += hyper & 15; | |
result.words += hyper2value[hyper >> 4]; | |
return (hyper >> 8) & 16; | |
} | |
static unsigned process_chunk(const unsigned char *text, const size_t bytes, | |
unsigned prev) { | |
result.chars += bytes; | |
for (size_t i = 0; i < bytes; i += 8) | |
prev = apply(reduce(prev, map(text, i, 0), map(text, i, 1), map(text, i, 2), | |
map(text, i, 3))); | |
return prev; | |
} | |
/******************************************************************************/ | |
static int process_fd(int fd) { | |
off_t length = lseek(fd, 0, SEEK_END); | |
if (length >= 0 && length <= INTPTR_MAX) { | |
void *ptr = mmap(NULL, (size_t)length, PROT_READ, MAP_PRIVATE, fd, 0); | |
if (ptr == MAP_FAILED) { | |
perror("mmap"); | |
return EXIT_FAILURE; | |
} | |
process_chunk((const unsigned char *)ptr, (size_t)length, 0); | |
munmap(ptr, length); | |
return EXIT_SUCCESS; | |
} | |
if (length < 0 && errno != ESPIPE) { | |
perror("leeek"); | |
return EXIT_FAILURE; | |
} | |
_Bool state = 0; | |
ssize_t chunk; | |
for (;;) { | |
unsigned char buf[65536]; | |
chunk = read(fd, buf, sizeof(buf)); | |
if (chunk < 1) | |
break; | |
if (chunk < (int)sizeof(buf)) | |
memset(buf + chunk, ' ', (8 - chunk) & 7); | |
state = process_chunk(buf, chunk, state); | |
} | |
if (chunk < 0) { | |
perror("read"); | |
return EXIT_FAILURE; | |
} | |
return EXIT_SUCCESS; | |
} | |
int main(int argc, const char *argv[]) { | |
build_hyper(); | |
int rc = 42; | |
if (argc > 1) { | |
for (int i = 1; i < argc; ++i) { | |
int fd = | |
(strcmp(argv[i], "-") == 0) ? STDIN_FILENO : open(argv[1], O_RDONLY); | |
if (fd < 0) { | |
perror("open"); | |
return EXIT_FAILURE; | |
} | |
rc = process_fd(fd); | |
close(fd); | |
if (rc != EXIT_SUCCESS) | |
break; | |
} | |
} else | |
rc = process_fd(STDIN_FILENO); | |
struct timespec ts = {0, 0}; | |
if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts)) | |
perror("clock_gettime(CLOCK_PROCESS_CPUTIME_ID)"); | |
if (rc == EXIT_SUCCESS) | |
printf("lines %zu, words %zu, chars %zu\ntook %.6f seconds\n", result.lines, | |
result.words, result.chars, ts.tv_nsec * 1e-9 + ts.tv_sec); | |
return rc; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment