Last active
December 17, 2015 00:18
-
-
Save vstakhov/99c04d6aa18e47c7950d to your computer and use it in GitHub Desktop.
Measure performance of different algorithms to match lines count in a text.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <sys/time.h> | |
#include <time.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <sys/types.h> | |
#include <stdint.h> | |
#include <string.h> | |
#include <stdbool.h> | |
#ifdef __SSE__ | |
#include <immintrin.h> | |
#endif | |
#include <limits.h> | |
#define CLOCK_TYPE CLOCK_PROCESS_CPUTIME_ID | |
double | |
get_ticks (void) | |
{ | |
double res; | |
#ifdef __APPLE__ | |
res = mach_absolute_time () / 1000000000.; | |
#else | |
struct timespec ts; | |
clock_gettime (CLOCK_TYPE, &ts); | |
res = (double)ts.tv_sec + ts.tv_nsec / 1000000000.; | |
#endif | |
return res; | |
} | |
#define SS (sizeof(size_t)) | |
#define ALIGN (sizeof(size_t) - 1) | |
#define ONES ((size_t)-1 / UCHAR_MAX) | |
#define HIGHS (ONES * (UCHAR_MAX / 2 + 1)) | |
#define HASZERO(x) ((x) - ONES & ~(x) & HIGHS) | |
void * | |
musl_memchr (const void *src, int c, size_t n) | |
{ | |
const unsigned char *s = src; | |
c = (unsigned char)c; | |
for (; ((uintptr_t)s & ALIGN) && n && *s != c; s++, n--) ; | |
if (n && *s != c) { | |
const size_t *w; | |
size_t k = ONES * c; | |
for (w = (const void *)s; n>=SS && !HASZERO (*w ^ k); w++, n -= SS) ; | |
for (s = (const void *)w; n && *s != c; s++, n--) ; | |
} | |
return n ? (void *)s : 0; | |
} | |
static const unsigned char BitsSetTable256[256] = | |
{ | |
# define B2(n) n, n + 1, n + 1, n + 2 | |
# define B4(n) B2 (n), B2 (n + 1), B2 (n + 1), B2 (n + 2) | |
# define B6(n) B4 (n), B4 (n + 1), B4 (n + 1), B4 (n + 2) | |
B6 (0), B6 (1), B6 (1), B6 (2) | |
}; | |
int | |
main (int argc, char **argv) | |
{ | |
struct timespec ts; | |
double t1, t2; | |
unsigned char *buf = NULL, *p; | |
size_t size = 0, r = 0, i, cnt, rm, ccnt; | |
FILE *in = stdin; | |
uint64_t *ip, mask, magic = ((uint64_t) 0x7efefefe << 32) | 0xfefefeff, tst; | |
double *dp; | |
int64_t inv = -1; | |
posix_memalign (&buf, 32, BUFSIZ); | |
size = BUFSIZ; | |
while (!feof (in) && !ferror (in)) { | |
if (r == size) { | |
posix_memalign (&p, 32, size * 2); | |
memcpy (p, buf, size); | |
buf = p; | |
size *= 2; | |
} | |
r += fread (buf + r, 1, size - r, in); | |
} | |
if (ferror (in)) { | |
fprintf (stderr, "Failed to read the input file.\n"); | |
exit (EXIT_FAILURE); | |
} | |
fclose (in); | |
p = buf; | |
cnt = 0; | |
t1 = get_ticks (); | |
for (i = 0; i < r; i++) { | |
if (p[i] == '\n') { | |
cnt++; | |
} | |
} | |
t2 = get_ticks (); | |
printf ("Naive: %zd %.6f\n", cnt, t2 - t1); | |
p = buf; | |
cnt = 0; | |
t1 = get_ticks (); | |
for (i = 0; i < r; i++) { | |
if ((p[i] ^ 10) == 0) { | |
cnt++; | |
} | |
} | |
t2 = get_ticks (); | |
printf ("Stupid xor: %zd %.6f\n", cnt, t2 - t1); | |
p = buf; | |
cnt = 0; | |
t1 = get_ticks (); | |
do { | |
p = memchr (p, '\n', r - (p - buf)); | |
} while (p != NULL && (++cnt), p++); | |
t2 = get_ticks (); | |
printf ("Memchr: %zd %.6f\n", cnt, t2 - t1); | |
p = buf; | |
cnt = 0; | |
t1 = get_ticks (); | |
do { | |
p = musl_memchr (p, '\n', r - (p - buf)); | |
} while (p != NULL && (++cnt), p++); | |
t2 = get_ticks (); | |
printf ("Memchr (musl): %zd %.6f\n", cnt, t2 - t1); | |
p = buf; | |
cnt = 0; | |
ip = (uint64_t *)p; | |
rm = r; | |
mask = 0x0A0A0A0A0A0A0A0AULL; | |
t1 = get_ticks (); | |
while (rm >= sizeof (*ip)) { | |
tst = *ip++ ^ mask; | |
if ((((tst + magic) ^ ~tst) & ~magic) != 0) { | |
p = (unsigned char *)(ip - 1); | |
for (i = 0; i < sizeof(*ip); i++) { | |
if ((p[i] ^ 10) == 0) { | |
cnt++; | |
} | |
} | |
} | |
rm -= sizeof(*ip); | |
} | |
p = (unsigned char *)(ip); | |
for (i = 0; i < rm; i++) { | |
if (p[i] == '\n') { | |
cnt++; | |
} | |
} | |
t2 = get_ticks (); | |
printf ("Magic bits: %zd %.6f\n", cnt, t2 - t1); | |
#ifdef __SSE__ | |
__m128i xmm0, xmm1, xmm2; | |
__m128i xmm3, xmm4, xmm5,xmm6,xmm7; | |
p = buf; | |
ip = (uint64_t *)p; | |
cnt = 0; | |
ccnt = 0; | |
rm = r; | |
xmm2 = _mm_set1_epi8 ('\n'); | |
t1 = get_ticks (); | |
while (rm >= sizeof (xmm0)) { | |
uint32_t res; | |
xmm0 = _mm_load_si128 ((const __m128i *)ip); // tst | |
xmm1 = _mm_cmpeq_epi8 (xmm0, xmm2); | |
res = _mm_movemask_epi8 (xmm1); | |
p = (unsigned char *)&res; | |
cnt += BitsSetTable256[p[0]] + BitsSetTable256[p[1]] + | |
BitsSetTable256[p[2]] + BitsSetTable256[p[3]]; | |
ip += sizeof(xmm0) / sizeof(*ip); | |
rm -= sizeof(xmm0); | |
} | |
p = (unsigned char *)(ip); | |
for (i = 0; i < rm; i++) { | |
if (p[i] == '\n') { | |
cnt++; | |
} | |
} | |
t2 = get_ticks (); | |
printf ("SSE: %zd %.6f\n", cnt, t2 - t1); | |
#endif | |
#ifdef __SSE__ | |
t1 = get_ticks (); | |
p = buf; | |
ip = (uint64_t *)p; | |
xmm2 = _mm_set1_epi8 ('\n'); | |
xmm3 = _mm_set1_epi8 (1); | |
xmm6 = _mm_setzero_si128 (); | |
xmm7 = xmm6; | |
/* First part. */ | |
if (r>=960) | |
{ | |
uint64_t *ipe1 = ip + ((r - (size_t)952) >> 3); | |
do | |
{ | |
__m128i xmm10,xmm11,xmm14,xmm15; | |
xmm4 = xmm7; | |
xmm14 = xmm7; | |
uint64_t *ipe2 = ip + 120; | |
do | |
{ | |
xmm0 = _mm_load_si128 ((const __m128i *)ip); ip += 2; | |
xmm10 = _mm_load_si128 ((const __m128i *)ip); ip += 2; | |
xmm1 = _mm_cmpeq_epi8 ( xmm0, xmm2); | |
xmm11 = _mm_cmpeq_epi8 (xmm10, xmm2); | |
xmm5 = _mm_min_epu8 ( xmm1, xmm3); | |
xmm15 = _mm_min_epu8 (xmm11, xmm3); | |
xmm4 = _mm_adds_epu8 ( xmm4,xmm5); | |
xmm14 = _mm_adds_epu8 (xmm14,xmm15); | |
} | |
while (ip < ipe2); | |
xmm1 = _mm_sad_epu8 ( xmm4,xmm7); | |
xmm11 = _mm_sad_epu8 (xmm14,xmm7); | |
xmm6 = _mm_add_epi64 (xmm6, xmm1); | |
xmm6 = _mm_add_epi64 (xmm6,xmm11); | |
} | |
while (ip < ipe1); | |
} | |
/* Second part. */ | |
do | |
{ | |
xmm4 = xmm7; | |
uint64_t *ipe2 = (uint64_t *)(p + ((r >> 4) << 4)); | |
for (; ip<ipe2; ip += 2) | |
{ | |
xmm0 = _mm_load_si128 ((const __m128i *)ip); | |
xmm1 = _mm_cmpeq_epi8 (xmm0, xmm2); | |
xmm5 = _mm_min_epu8 (xmm1, xmm3); | |
xmm4 = _mm_adds_epu8 (xmm4,xmm5); | |
} | |
xmm1 = _mm_sad_epu8 (xmm4,xmm7); | |
xmm6 = _mm_add_epi64 (xmm6,xmm1); | |
cnt = (size_t)_mm_cvtsi128_si64 (xmm6); | |
cnt += (size_t)_mm_cvtsi128_si64 ( | |
(_mm_castpd_si128 ( | |
_mm_shuffle_pd (_mm_castsi128_pd (xmm6), | |
_mm_castsi128_pd (xmm6), 1)))); | |
} | |
while (false); | |
/* Last part. */ | |
do { | |
unsigned char *pe = p + r; | |
p = (unsigned char *)(ip); | |
for (; p<pe; ++p) if (*p=='\n') ++cnt; | |
} while (false); | |
t2 = get_ticks (); | |
printf ("SSE2: %zd %.6f\n", cnt, t2 - t1); | |
#endif | |
#ifdef __AVX2__ | |
__m256i ymm0, ymm1, ymm2, ymm3; | |
p = buf; | |
ip = (uint64_t *)p; | |
cnt = 0; | |
ccnt = 0; | |
rm = r; | |
xmm2 = _mm_set1_epi8 ('\n'); | |
ymm2 = _mm256_broadcastsi128_si256 (xmm2); | |
xmm2 = _mm_set1_epi8 (-1); | |
ymm3 = _mm256_broadcastsi128_si256 (xmm2); | |
t1 = get_ticks (); | |
while (rm >= sizeof (ymm0)) { | |
uint32_t res; | |
//ymm0 = _mm256_stream_load_si256 ((const __m256i *)ip); // tst | |
ymm0 = _mm256_maskload_epi32 (ip, ymm3); | |
ymm1 = _mm256_cmpeq_epi8 (ymm0, ymm2); | |
res = _mm256_movemask_epi8 (ymm1); | |
p = (unsigned char *)&res; | |
cnt += BitsSetTable256[p[0]] + BitsSetTable256[p[1]] + | |
BitsSetTable256[p[2]] + BitsSetTable256[p[3]]; | |
ip += sizeof(ymm0) / sizeof(*ip); | |
rm -= sizeof(ymm0); | |
} | |
p = (unsigned char *)(ip); | |
for (i = 0; i < rm; i++) { | |
if (p[i] == '\n') { | |
cnt++; | |
} | |
} | |
t2 = get_ticks (); | |
printf ("AVX2: %zd %.6f\n", cnt, t2 - t1); | |
#endif | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment