Create a gist now

Instantly share code, notes, and snippets.

@vstakhov /nlines.c
Last active Dec 17, 2015

What would you like to do?
Measure performance of different algorithms to match lines count in a text.
#include <sys/time.h>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <stdint.h>
#include <string.h>
#include <stdbool.h>
#ifdef __SSE__
#include <immintrin.h>
#endif
#include <limits.h>
#define CLOCK_TYPE CLOCK_PROCESS_CPUTIME_ID
double
get_ticks (void)
{
double res;
#ifdef __APPLE__
res = mach_absolute_time () / 1000000000.;
#else
struct timespec ts;
clock_gettime (CLOCK_TYPE, &ts);
res = (double)ts.tv_sec + ts.tv_nsec / 1000000000.;
#endif
return res;
}
#define SS (sizeof(size_t))
#define ALIGN (sizeof(size_t) - 1)
#define ONES ((size_t)-1 / UCHAR_MAX)
#define HIGHS (ONES * (UCHAR_MAX / 2 + 1))
#define HASZERO(x) ((x) - ONES & ~(x) & HIGHS)
void *
musl_memchr (const void *src, int c, size_t n)
{
const unsigned char *s = src;
c = (unsigned char)c;
for (; ((uintptr_t)s & ALIGN) && n && *s != c; s++, n--) ;
if (n && *s != c) {
const size_t *w;
size_t k = ONES * c;
for (w = (const void *)s; n>=SS && !HASZERO (*w ^ k); w++, n -= SS) ;
for (s = (const void *)w; n && *s != c; s++, n--) ;
}
return n ? (void *)s : 0;
}
static const unsigned char BitsSetTable256[256] =
{
# define B2(n) n, n + 1, n + 1, n + 2
# define B4(n) B2 (n), B2 (n + 1), B2 (n + 1), B2 (n + 2)
# define B6(n) B4 (n), B4 (n + 1), B4 (n + 1), B4 (n + 2)
B6 (0), B6 (1), B6 (1), B6 (2)
};
int
main (int argc, char **argv)
{
struct timespec ts;
double t1, t2;
unsigned char *buf = NULL, *p;
size_t size = 0, r = 0, i, cnt, rm, ccnt;
FILE *in = stdin;
uint64_t *ip, mask, magic = ((uint64_t) 0x7efefefe << 32) | 0xfefefeff, tst;
double *dp;
int64_t inv = -1;
posix_memalign (&buf, 32, BUFSIZ);
size = BUFSIZ;
while (!feof (in) && !ferror (in)) {
if (r == size) {
posix_memalign (&p, 32, size * 2);
memcpy (p, buf, size);
buf = p;
size *= 2;
}
r += fread (buf + r, 1, size - r, in);
}
if (ferror (in)) {
fprintf (stderr, "Failed to read the input file.\n");
exit (EXIT_FAILURE);
}
fclose (in);
p = buf;
cnt = 0;
t1 = get_ticks ();
for (i = 0; i < r; i++) {
if (p[i] == '\n') {
cnt++;
}
}
t2 = get_ticks ();
printf ("Naive: %zd %.6f\n", cnt, t2 - t1);
p = buf;
cnt = 0;
t1 = get_ticks ();
for (i = 0; i < r; i++) {
if ((p[i] ^ 10) == 0) {
cnt++;
}
}
t2 = get_ticks ();
printf ("Stupid xor: %zd %.6f\n", cnt, t2 - t1);
p = buf;
cnt = 0;
t1 = get_ticks ();
do {
p = memchr (p, '\n', r - (p - buf));
} while (p != NULL && (++cnt), p++);
t2 = get_ticks ();
printf ("Memchr: %zd %.6f\n", cnt, t2 - t1);
p = buf;
cnt = 0;
t1 = get_ticks ();
do {
p = musl_memchr (p, '\n', r - (p - buf));
} while (p != NULL && (++cnt), p++);
t2 = get_ticks ();
printf ("Memchr (musl): %zd %.6f\n", cnt, t2 - t1);
p = buf;
cnt = 0;
ip = (uint64_t *)p;
rm = r;
mask = 0x0A0A0A0A0A0A0A0AULL;
t1 = get_ticks ();
while (rm >= sizeof (*ip)) {
tst = *ip++ ^ mask;
if ((((tst + magic) ^ ~tst) & ~magic) != 0) {
p = (unsigned char *)(ip - 1);
for (i = 0; i < sizeof(*ip); i++) {
if ((p[i] ^ 10) == 0) {
cnt++;
}
}
}
rm -= sizeof(*ip);
}
p = (unsigned char *)(ip);
for (i = 0; i < rm; i++) {
if (p[i] == '\n') {
cnt++;
}
}
t2 = get_ticks ();
printf ("Magic bits: %zd %.6f\n", cnt, t2 - t1);
#ifdef __SSE__
__m128i xmm0, xmm1, xmm2;
__m128i xmm3, xmm4, xmm5,xmm6,xmm7;
p = buf;
ip = (uint64_t *)p;
cnt = 0;
ccnt = 0;
rm = r;
xmm2 = _mm_set1_epi8 ('\n');
t1 = get_ticks ();
while (rm >= sizeof (xmm0)) {
uint32_t res;
xmm0 = _mm_load_si128 ((const __m128i *)ip); // tst
xmm1 = _mm_cmpeq_epi8 (xmm0, xmm2);
res = _mm_movemask_epi8 (xmm1);
p = (unsigned char *)&res;
cnt += BitsSetTable256[p[0]] + BitsSetTable256[p[1]] +
BitsSetTable256[p[2]] + BitsSetTable256[p[3]];
ip += sizeof(xmm0) / sizeof(*ip);
rm -= sizeof(xmm0);
}
p = (unsigned char *)(ip);
for (i = 0; i < rm; i++) {
if (p[i] == '\n') {
cnt++;
}
}
t2 = get_ticks ();
printf ("SSE: %zd %.6f\n", cnt, t2 - t1);
#endif
#ifdef __SSE__
t1 = get_ticks ();
p = buf;
ip = (uint64_t *)p;
xmm2 = _mm_set1_epi8 ('\n');
xmm3 = _mm_set1_epi8 (1);
xmm6 = _mm_setzero_si128 ();
xmm7 = xmm6;
/* First part. */
if (r>=960)
{
uint64_t *ipe1 = ip + ((r - (size_t)952) >> 3);
do
{
__m128i xmm10,xmm11,xmm14,xmm15;
xmm4 = xmm7;
xmm14 = xmm7;
uint64_t *ipe2 = ip + 120;
do
{
xmm0 = _mm_load_si128 ((const __m128i *)ip); ip += 2;
xmm10 = _mm_load_si128 ((const __m128i *)ip); ip += 2;
xmm1 = _mm_cmpeq_epi8 ( xmm0, xmm2);
xmm11 = _mm_cmpeq_epi8 (xmm10, xmm2);
xmm5 = _mm_min_epu8 ( xmm1, xmm3);
xmm15 = _mm_min_epu8 (xmm11, xmm3);
xmm4 = _mm_adds_epu8 ( xmm4,xmm5);
xmm14 = _mm_adds_epu8 (xmm14,xmm15);
}
while (ip < ipe2);
xmm1 = _mm_sad_epu8 ( xmm4,xmm7);
xmm11 = _mm_sad_epu8 (xmm14,xmm7);
xmm6 = _mm_add_epi64 (xmm6, xmm1);
xmm6 = _mm_add_epi64 (xmm6,xmm11);
}
while (ip < ipe1);
}
/* Second part. */
do
{
xmm4 = xmm7;
uint64_t *ipe2 = (uint64_t *)(p + ((r >> 4) << 4));
for (; ip<ipe2; ip += 2)
{
xmm0 = _mm_load_si128 ((const __m128i *)ip);
xmm1 = _mm_cmpeq_epi8 (xmm0, xmm2);
xmm5 = _mm_min_epu8 (xmm1, xmm3);
xmm4 = _mm_adds_epu8 (xmm4,xmm5);
}
xmm1 = _mm_sad_epu8 (xmm4,xmm7);
xmm6 = _mm_add_epi64 (xmm6,xmm1);
cnt = (size_t)_mm_cvtsi128_si64 (xmm6);
cnt += (size_t)_mm_cvtsi128_si64 (
(_mm_castpd_si128 (
_mm_shuffle_pd (_mm_castsi128_pd (xmm6),
_mm_castsi128_pd (xmm6), 1))));
}
while (false);
/* Last part. */
do {
unsigned char *pe = p + r;
p = (unsigned char *)(ip);
for (; p<pe; ++p) if (*p=='\n') ++cnt;
} while (false);
t2 = get_ticks ();
printf ("SSE2: %zd %.6f\n", cnt, t2 - t1);
#endif
#ifdef __AVX2__
__m256i ymm0, ymm1, ymm2, ymm3;
p = buf;
ip = (uint64_t *)p;
cnt = 0;
ccnt = 0;
rm = r;
xmm2 = _mm_set1_epi8 ('\n');
ymm2 = _mm256_broadcastsi128_si256 (xmm2);
xmm2 = _mm_set1_epi8 (-1);
ymm3 = _mm256_broadcastsi128_si256 (xmm2);
t1 = get_ticks ();
while (rm >= sizeof (ymm0)) {
uint32_t res;
//ymm0 = _mm256_stream_load_si256 ((const __m256i *)ip); // tst
ymm0 = _mm256_maskload_epi32 (ip, ymm3);
ymm1 = _mm256_cmpeq_epi8 (ymm0, ymm2);
res = _mm256_movemask_epi8 (ymm1);
p = (unsigned char *)&res;
cnt += BitsSetTable256[p[0]] + BitsSetTable256[p[1]] +
BitsSetTable256[p[2]] + BitsSetTable256[p[3]];
ip += sizeof(ymm0) / sizeof(*ip);
rm -= sizeof(ymm0);
}
p = (unsigned char *)(ip);
for (i = 0; i < rm; i++) {
if (p[i] == '\n') {
cnt++;
}
}
t2 = get_ticks ();
printf ("AVX2: %zd %.6f\n", cnt, t2 - t1);
#endif
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment