Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Simple AVX2-enabled version of ASCII-only wc util
/*
* zlib License, see https://en.wikipedia.org/wiki/Zlib_License
*
* Copyright (c) 2020 Leonid Yuriev <leo@yuriev.ru>,
* Simple portable optimized ASCII-only version of wc tool.
* Just for fun &
* https://en.wikipedia.org/wiki/The_Computer_Language_Benchmarks_Game
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgement in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*
*/
#include <errno.h>
#include <fcntl.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <time.h>
#include <unistd.h>
#include <x86intrin.h>
static struct { size_t chars, words, lines; } result;
static unsigned process_chunk(const unsigned char *text, const size_t bytes,
_Bool was_space) {
result.chars += bytes;
/* This loop was copied from the https://habr.com/ru/post/489898/ article
* with minor changes */
for (size_t i = 0; i < bytes; i += 32) {
_mm_prefetch(text + i + 32 * 64, _MM_HINT_NTA);
__m256i const c = _mm256_load_si256((__m256i *)(text + i));
uint32_t is_newline_mask =
_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, _mm256_set1_epi8('\n')));
uint32_t is_space_mask = _mm256_movemask_epi8(
_mm256_andnot_si256(c, _mm256_sub_epi8(c, _mm256_set1_epi8(' ' + 1))));
uint32_t was_space_mask = (was_space ? 1 : 0) | (is_space_mask << 1);
result.lines += _mm_popcnt_u32(is_newline_mask);
result.words += _mm_popcnt_u32(is_space_mask & ~was_space_mask);
was_space = is_space_mask & 0x80000000;
}
return was_space;
}
/******************************************************************************/
static int process_fd(int fd) {
off_t length = lseek(fd, 0, SEEK_END);
if (length >= 0 && length <= INTPTR_MAX) {
void *ptr = mmap(NULL, (size_t)length, PROT_READ, MAP_PRIVATE, fd, 0);
if (ptr == MAP_FAILED) {
perror("mmap");
return EXIT_FAILURE;
}
process_chunk((const unsigned char *)ptr, (size_t)length, 0);
munmap(ptr, length);
return EXIT_SUCCESS;
}
if (length < 0 && errno != ESPIPE) {
perror("leeek");
return EXIT_FAILURE;
}
_Bool state = 0;
ssize_t chunk;
for (;;) {
unsigned char buf[65536];
chunk = read(fd, buf, sizeof(buf));
if (chunk < 1)
break;
if (chunk < (int)sizeof(buf))
memset(buf + chunk, ' ', (32 - chunk) & 31);
state = process_chunk(buf, chunk, state);
}
if (chunk < 0) {
perror("read");
return EXIT_FAILURE;
}
return EXIT_SUCCESS;
}
int main(int argc, const char *argv[]) {
int rc = 42;
if (argc > 1) {
for (int i = 1; i < argc; ++i) {
int fd =
(strcmp(argv[i], "-") == 0) ? STDIN_FILENO : open(argv[1], O_RDONLY);
if (fd < 0) {
perror("open");
return EXIT_FAILURE;
}
rc = process_fd(fd);
close(fd);
if (rc != EXIT_SUCCESS)
break;
}
} else
rc = process_fd(STDIN_FILENO);
struct timespec ts = {0, 0};
if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts))
perror("clock_gettime(CLOCK_PROCESS_CPUTIME_ID)");
if (rc == EXIT_SUCCESS)
printf("lines %zu, words %zu, chars %zu\ntook %.6f seconds\n", result.lines,
result.words, result.chars, ts.tv_nsec * 1e-9 + ts.tv_sec);
return rc;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.