Created
November 13, 2011 08:42
-
-
Save karthick18/1361842 to your computer and use it in GitHub Desktop.
memcmp with SSE2 16 byte loads and compares
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Half-word (16-bit) compare using SSE2 intrinsics for faster 16-byte loads and compares.
 *
 * Compile with:
 * gcc -o sse_load_test sse_load.c -Wall -O2 -g -DTEST_SSE2
 *
 */
#if defined(TEST_SSE2) && !defined(__SSE2__) | |
#error "no SSE2 set" | |
#endif | |
#include <emmintrin.h> | |
#include <stdint.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <assert.h> | |
#ifdef TEST_SSE2 | |
#define __MEMCMP __sse_memcmp | |
/*
 * Compare `len` half words of two buffers one element at a time.
 *
 * Used by __sse_memcmp for whatever is left once its 16-byte SIMD loop has run.
 *
 * Returns 0 when all `len` half words match (or when len <= 0) and -1 at the
 * first mismatch.
 */
static int __sse_memcmp_tail(const uint16_t *a, const uint16_t *b, int len)
{
    /*
     * A plain loop rather than an unrolled switch over 1..8: a switch with a
     * fixed set of cases reports "equal" for any length it has no case for,
     * without comparing a single element.
     */
    for (int i = 0; i < len; ++i) {
        if (a[i] != b[i]) {
            return -1;
        }
    }
    return 0;
}

/*
 * Equality test for two uint16_t buffers using SSE2 16-byte aligned loads.
 *
 * a, b        buffers to compare. Both must be 2-byte aligned and must share
 *             the same offset within a 16-byte boundary; otherwise -1 is
 *             returned without looking at the data.
 * half_words  number of 16-bit elements (not bytes) to compare.
 *
 * Returns 0 when the buffers match (also when half_words <= 0, or when both
 * pointers are NULL) and -1 otherwise. Unlike memcmp, the result carries no
 * ordering information.
 */
static int __sse_memcmp(const uint16_t *a, const uint16_t *b, int half_words)
{
    const uintptr_t vec_mask = sizeof(__m128i) - 1;
    const int words_per_vec = (int)(sizeof(__m128i) / sizeof(*a));
    int len = half_words;

    /* Nothing to compare; also keeps a negative count out of the loops below. */
    if (len <= 0) {
        return 0;
    }
    if (!a && !b) {
        return 0;
    }
    if (!a || !b) {
        return -1;
    }

    /* Odd addresses cannot hold a properly aligned uint16_t. */
    if (((uintptr_t)a & 1) || ((uintptr_t)b & 1)) {
        return -1;
    }

    /*
     * Both buffers have to sit at the same offset inside a 16-byte block so
     * that a single scalar prologue brings them both to an aligned address.
     */
    if (((uintptr_t)a & vec_mask) != ((uintptr_t)b & vec_mask)) {
        return -1;
    }

    /* Scalar prologue: step one half word at a time up to the next 16-byte boundary. */
    while (len > 0 && ((uintptr_t)a & vec_mask)) {
        if (*a++ != *b++) {
            return -1;
        }
        --len;
    }

    /*
     * SIMD body: consume 8 half words per iteration for as long as a full
     * vector remains. The condition must be "at least one vector left", not
     * "remaining length is a multiple of 8" - the latter skips the whole loop
     * for lengths such as 10 or 20.
     */
    while (len >= words_per_vec) {
        const __m128i x = _mm_load_si128((const __m128i *)a);
        const __m128i y = _mm_load_si128((const __m128i *)b);

        /* _mm_cmpeq_epi16 yields 0xffff in every 16-bit lane that matches. */
        const __m128i eq = _mm_cmpeq_epi16(x, y);

        /*
         * _mm_movemask_epi8 gathers the top bit of each of the 16 bytes, so a
         * full match produces exactly 0xffff.
         */
        if (_mm_movemask_epi8(eq) != 0xffff) {
            return -1;
        }

        a += words_per_vec;
        b += words_per_vec;
        len -= words_per_vec;
    }

    /* Fewer than 8 half words remain. */
    return __sse_memcmp_tail(a, b, len);
}
#else | |
#define __MEMCMP memcmp | |
#endif | |
/*
 * Fill two freshly allocated buffers of `half_words` 16-bit values and run the
 * comparison selected by __MEMCMP over them.
 *
 * half_words        number of uint16_t elements per buffer.
 * randomize_factor  0 makes the buffers identical; otherwise element i of `a`
 *                   is copied from `b` only when (i & randomize_factor) != 0,
 *                   and the remaining elements keep independent random values.
 *
 * Returns the result of __MEMCMP, or 0 when half_words <= 0 (nothing to
 * compare). Aborts when the buffers cannot be allocated.
 */
static int test_sse_memcmp(int half_words, int randomize_factor)
{
    uint16_t *a;
    uint16_t *b;
    int cmp;
    int i;

    if (half_words <= 0) {
        return 0;
    }

    a = malloc((size_t)half_words * sizeof(*a));
    b = malloc((size_t)half_words * sizeof(*b));
    /* Explicit check instead of assert() so it survives -DNDEBUG builds. */
    if (!a || !b) {
        fprintf(stderr, "test_sse_memcmp: cannot allocate %d half words\n", half_words);
        free(a);
        free(b);
        abort();
    }

    for (i = 0; i < half_words; ++i) {
        /* Draw a before b for every element, then truncate to 16 bits. */
        a[i] = (uint16_t)(random() & ((1 << 24) - 1));
        b[i] = (uint16_t)(random() & ((1 << 24) - 1));
        if (!randomize_factor || (i & randomize_factor)) {
            a[i] = b[i];
        }
    }

#ifdef TEST_SSE2
    /* The SSE2 routine takes its length in 16-bit half words. */
    cmp = __MEMCMP(a, b, half_words);
#else
    /* memcmp takes its length in bytes; passing half_words would cover only half of each buffer. */
    cmp = __MEMCMP(a, b, (size_t)half_words * sizeof(*a));
#endif

    free(a);
    free(b);
    return cmp;
}
/*
 * Usage: sse_load_test [half_words [randomize_factor]]
 *
 * Performs one aligned SSE2 load as a warm-up / sanity check, then runs
 * test_sse_memcmp and prints its result.
 *
 * half_words        number of 16-bit elements to compare; missing, zero or
 *                   negative values fall back to 10.
 * randomize_factor  forwarded unchanged to test_sse_memcmp; defaults to 0.
 */
int main(int argc, char **argv)
{
    __attribute__((aligned(16))) const uint16_t val[8] = { 1, 1, 1, 1, 1, 1, 1, 1 };
    union { __m128i ret; uint16_t ret_s[8]; } v;
    int randomize_factor = 0;
    int half_words = 0;

    /*
     * warm up: load the constant vector and verify each lane round-trips.
     */
    v.ret = _mm_load_si128((const __m128i *)val);
    for (size_t i = 0; i < sizeof(val) / sizeof(val[0]); ++i) {
        if (v.ret_s[i] != val[i]) {
            /* Report the lane index, as the message says, not the loaded value. */
            printf("val at index [%d] doesn't match expected [%d]\n", (int)i, val[i]);
        }
    }

    if (argc > 1) {
        half_words = atoi(argv[1]);
    }
    if (argc > 2) {
        randomize_factor = atoi(argv[2]);
    }
    /* A negative count would otherwise turn into an enormous allocation request. */
    if (half_words <= 0) {
        half_words = 10;
    }

    printf("memcmp test result [%d]\n", test_sse_memcmp(half_words, randomize_factor));
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment