@pcordes
Created May 11, 2016 00:23
// x86 SIMD string to uppercase
// See http://stackoverflow.com/questions/735204/convert-a-string-in-c-to-upper-case
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <strings.h> // for ffs
#include <ctype.h>
#include <immintrin.h>
/////// Timing results from a 2.4GHz Core2duo E6600 (Conroe/Merom)
/////// -march=native includes SSSE3
// clang-3.8 -march=native -Wall -Wextra -O3 simd-flipcase.c test-flipcase.c -DNOINLINE_INTO_MAIN -DSTRTOUPPER=strtoupper_autovec -o flipcase-clang-autovec
// Don't worry, the IACA stuff expands to nothing unless specifically enabled
// gcc -DIACA_MARKS -march=native -Wall -Wextra -O3 simd-flipcase.c
#ifdef IACA_MARKS
//#include </opt/iaca-2.1/include/iacaMarks.h>
#define IACA_SSC_MARK( MARK_ID )                     \
    __asm__ __volatile__ (                           \
        "\n\t movl $"#MARK_ID", %%ebx"               \
        "\n\t .byte 0x64, 0x67, 0x90"                \
        : : : /* "memory" */ );
#define IACA_START /*IACA_UD_BYTES */ IACA_SSC_MARK(111)
#define IACA_END IACA_SSC_MARK(222) // IACA_UD_BYTES
#else
#define IACA_START
#define IACA_END
#endif
//#define _mm_storeu_si128 _mm_store_si128
// upcase all alphabetic ASCII bytes in a src vector
//inline static
__m128i upcase_si128(__m128i src) {
    /*
     * SSE only has a signed compare-greater, but we can still use the "unsigned
     * compare" trick by range-shifting to the bottom of the signed range:
     * subtract 'a'+128, so the alphabetic characters range from -128 to -128+25 (-128 + 'z'-'a').
     *
     * Note that adding 128 and subtracting 128 are the same thing for 8-bit integers.
     * There's nowhere for the carry to go, so it's just XOR (carryless add), flipping the high bit.
     */
    __m128i rangeshift = _mm_sub_epi8(src, _mm_set1_epi8('a'+128));
    __m128i nomodify   = _mm_cmpgt_epi8(rangeshift, _mm_set1_epi8(-128 + 25)); // 0:lower case  -1:anything else (upper case or non-alphabetic).  25 = 'z' - 'a'
    __m128i flip       = _mm_andnot_si128(nomodify, _mm_set1_epi8(0x20));      // 0x20:lcase  0:non-lcase
    // just mask the XOR-mask so elements are XORed with 0 instead of 0x20
    // XOR's identity value is 0, same as addition's
    return _mm_xor_si128(src, flip);
    // it's easier to XOR with 0x20 or 0 than to AND with ~0x20 or 0xFF
}
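// Not part of the original gist: a sketch of the same range-shift + XOR-mask trick widened
// to 256-bit vectors, assuming the target has AVX2 (compile with -mavx2 or a -march=native
// that includes it).  The logic is identical, just with the _mm256_* intrinsics.
#ifdef __AVX2__
__m256i upcase_si256(__m256i src) {
    __m256i rangeshift = _mm256_sub_epi8(src, _mm256_set1_epi8((char)('a'+128)));
    __m256i nomodify   = _mm256_cmpgt_epi8(rangeshift, _mm256_set1_epi8(-128 + 25)); // -1 for anything that isn't lower-case
    __m256i flip       = _mm256_andnot_si256(nomodify, _mm256_set1_epi8(0x20));      // 0x20 only in lower-case lanes
    return _mm256_xor_si256(src, flip);
}
#endif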
// define our own because this function is intentionally not locale-aware
// A char arg and return value make clang waste an insn on sign extension.  But when gcc auto-vectorizes this int version, it unpacks/repacks to 4B elements.
int ascii_toupper(int c) {
    return ('a' <= c && c <= 'z') ? c-0x20 : c;   // using - lets the compiler use LEA
}
// TODO: detect non-ASCII (e.g. bytes > 0x7F) and fall back to UTF-8-aware scalar code to handle multibyte variable-size characters.
// toupper can map an ASCII char to a non-ASCII char (e.g. Turkish i -> İ, not I)
// PCMPISTRI can check for that and the terminator at the same time, and is only 3 uops on SnB-family CPUs
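// Not in the original: a minimal sketch of the non-ASCII detection mentioned in the TODO above.
// The sign bit of each byte is set exactly for values >= 0x80, so PMOVMSKB gives a bitmask of
// non-ASCII byte positions; a nonzero result means the caller should fall back to scalar
// UTF-8-aware code (the fallback itself is not implemented here).
static inline int vec_has_non_ascii(__m128i sv) {
    return _mm_movemask_epi8(sv) != 0;   // any byte with the high bit set => not plain ASCII
}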
// convert to uppercase and return strlen
// works in-place if dst = src, or as a copy-and-modify
// if (dst != src), they must be at least 16B apart
// Merom:
// 40M iterations, separate destbuf, non-bloated cleanup (#if 1)
// 15 char cmdline string. gcc 5.2 native: inlined: 1.24s. Not inlined: 1.29s
// 15 char cmdline string. clang3.8 native inlined: 1.04s. Not inlined: 1.17s
// 16 char cmdline string. gcc 5.2 native: inlined: 0.270s. Not inlined: 0.335s
// 16 char cmdline string. clang3.8 native inlined: 0.216s. Not inlined: 0.357s
// 17 char cmdline string. gcc 5.2 native: inlined: 0.399s. Not inlined: 0.48s
// 17 char cmdline string. clang3.8 native inlined: 0.383s. Not inlined: 0.45s
// 31 char cmdline string. gcc 5.2 native: inlined: 0.391s. Not inlined: 0.479s
// 31 char cmdline string. clang3.8 native inlined: 0.393s. Not inlined: 0.453s
// 127 char cmdline str gcc 5.2 native: inlined: 0.986s. Not inlined: 0.925s
// 127 char cmdline str clang3.8 native inlined: 0.822s. Not inlined: 0.888s
// 128 char cmdline str gcc 5.2 native: inlined: 0.885s. Not inlined: 0.931s
// 128 char cmdline str clang3.8 native inlined: 0.739s. Not inlined: 0.829s
// 129 char cmdline str gcc 5.2 native: inlined: 0.966s Not inlined: 1.02s
// 129 char cmdline str clang3.8 native inlined: 1.17s. Not inlined: 1.23s
// 135 char cmdline str gcc 5.2 native: inlined: 0.964s Not inlined: 1.005s
// 135 char cmdline str clang3.8 native inlined: 0.905s Not inlined: 0.966s
//__attribute__((noinline))
size_t strtoupper_sse2(char *dst, const char *src_begin) {
    const char *src = src_begin;
    // scalar until the src pointer is aligned
    while ( (0xf & (uintptr_t)src) && *src ) {
        IACA_START
        *(dst++) = ascii_toupper(*(src++));
    }
    IACA_END
    if (!*src)
        return src - src_begin;

    // current position (src) is now 16B-aligned, and we're not at the end
    int zero_positions;
    IACA_START
    do {
        __m128i sv = _mm_load_si128( (const __m128i*)src );
        // TODO: SSE4.2 PCMPISTRI or PCMPISTRM version to combine the lower-case and '\0' detection?
        __m128i nullcheck = _mm_cmpeq_epi8(_mm_setzero_si128(), sv);
        zero_positions = _mm_movemask_epi8(nullcheck);
        // TODO: unroll so the null-byte check takes less overhead
        if (zero_positions)
            break;
        // doing the upcase before the loop break lets gcc realize that the constants are still
        // in registers for the unaligned cleanup version, but it leads to more wasted insns in the early-out case
        __m128i upcased = upcase_si128(sv);
        _mm_storeu_si128((__m128i*)dst, upcased);
        //_mm_store_si128((__m128i*)dst, upcased);  // for testing on CPUs where storeu is slow
        src += 16;
        dst += 16;
    } while(1);
    IACA_END

    // handle the last few bytes.  Options: scalar loop, masked store, or unaligned 16B.
    // Rewriting some bytes beyond the end of the string would be easy,
    // but doing a non-atomic read-modify-write outside of the string is not safe.
    // Upcasing is idempotent, so an unaligned, potentially-overlapping vector is a good option.
    unsigned int cleanup_bytes = ffs(zero_positions) - 1;   // excluding the trailing null
    const char* last_byte = src + cleanup_bytes;             // points at the terminating '\0'
    // FIXME: copy the terminating 0 when we end at an aligned vector boundary
    // optionally special-case cleanup_bytes == 15: the final aligned vector can be used.
    if (cleanup_bytes > 0) {
        if (last_byte - src_begin >= 16) {
            // IACA_START
            // if src==dst, this load overlaps with the last store: store-forwarding stall.  Hopefully OOO execution hides it
            __m128i sv = _mm_loadu_si128( (const __m128i*)(last_byte-15) );  // includes the \0
            _mm_storeu_si128((__m128i*)(dst + cleanup_bytes - 15), upcase_si128(sv));
            // IACA_END
        } else {
            // whole string less than 16B
            // if this is common, try 64b or even 32b cleanup with movq / movd and upcase_si128
#if 1
            // copies the trailing 0 byte.
            for (unsigned int i = 0 ; i <= cleanup_bytes ; ++i) {
                IACA_START
                dst[i] = ascii_toupper(src[i]);
            }
#else
            // gcc stupidly auto-vectorizes this, resulting in huge code bloat, but no measurable slowdown because it never runs
            for (int i = cleanup_bytes - 1 ; i >= 0 ; --i) {
                IACA_START
                dst[i] = ascii_toupper(src[i]);
            }
            IACA_END
#endif
        }
    }
    return last_byte - src_begin;
}
// 135 chars. Merom, 40M iters, gcc, not inlined: 9.48s
size_t strtoupper_simple(char *dst, const char *src_begin) {
    const char *src = src_begin;
    while (*src)
        *(dst++) = ascii_toupper(*(src++));
    return src - src_begin;
}
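// Not in the original gist: a quick correctness cross-check (a sketch, not a benchmark).
// It compares the SSE2 version against the simple scalar loop on one string; the buffers are
// separate and more than 16B apart, as strtoupper_sse2 requires, and strlen(s) is assumed to
// be well under 4096.  Note that strtoupper_sse2's FIXME above means the terminating '\0' may
// not be copied when the string ends exactly at an aligned 16B boundary, so only the first
// 'len' bytes are compared here.
int strtoupper_check(const char *s) {
    static char a[4096], b[4096];
    size_t la = strtoupper_sse2(a, s);
    size_t lb = strtoupper_simple(b, s);
    return la == lb && memcmp(a, b, la) == 0;   // 1 if both versions agree
}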
char ascii_toupper_char(char c) {
    return ('a' <= c && c <= 'z') ? c^0x20 : c;   // ^ autovectorizes to PXOR: runs on more ports than paddb
    // return c ^ ('a' <= c && c <= 'z') ? 0x20 : 0;  // failed attempt to get gcc to mask the subtract vector, rather than subtract and blend.  Makes much worse code with many more constants.
    // (Also note that ?: binds more loosely than ^, so as written that parses as (c ^ cond) ? 0x20 : 0; the ternary would need its own parens.)
}
// Merom:
// 40M iterations, separate destbuf, char ascii_toupper(char)
// 16 char cmdline string. gcc 5.2 native: inlined: 0.187s. Not inlined: 1.52s
// 15 char cmdline string. gcc 5.2 native: inlined: 1.14s. Not inlined: 1.34s
// 127 char cmdline str gcc 5.2 native: inlined: 1.92s. Not inlined: 2.98s
// 127 char cmdline str clang3.8 inlined: 3.56s (5% branch misses). Not inlined: 2.82s
// 128 char cmdline str gcc 5.2 native: inlined: 0.94s. Not inlined: 2.06s
// 128 char cmdline str clang3.8 native inlined: 1.64s. Not inlined: 2.11s
// 129 char cmdline str gcc 5.2 native: inlined: 1.01s. Not inlined: 2.07s
// 129 char cmdline str clang3.8 native inlined: 1.67s. Not inlined: 2.19s
// 135 char cmdline str gcc 5.2 native: inlined: 1.48s. Not inlined: 2.52s
/* gcc can only auto-vectorize loops when the number of iterations is known before the first iteration, hence strlen
* This is a lot faster when inlined into the timing loop; I think strlen is hoisted.
*/
size_t strtoupper_autovec(char *dst, const char *src) {
    size_t len = strlen(src);
    for (size_t i=0 ; i<len ; ++i) {
        IACA_START
        dst[i] = ascii_toupper_char(src[i]);  // gcc does the vector range check with psubusb / pcmpeqb instead of pcmpgtb
    }
    IACA_END
    return len;
}
// Merom, 40M iterations, not inlined, 135 char string: 7.38s
size_t strtoupper_glibc(char *dst, const char *src_begin) {
    const char *src = src_begin;
    while (*src)
        *(dst++) = toupper(*(src++));
    return src - src_begin;
}
#ifdef INLINE_INTO_MAIN  // otherwise compile this main() in a separate file
#ifndef STRTOUPPER
#define STRTOUPPER strtoupper_sse2   // default so this file still builds without -DSTRTOUPPER=...
#endif
char buf[] = "ajIjlkasfoioi1287l kjl 8u12 l1kl4;k1uj489 k1jnh24kjhk1 4142joi1u4 o1h24lkn14kljhasfhsf lkasjf lksjf lasjf ;iuo32r ;laknlkas jfdjsfa zzz";
//__attribute__((aligned(16))) char buf[] = "Klkjo" ;
__attribute__((aligned(16))) char dstbuf[4096];

int main(int argc, char **argv) {
    const char *src = buf;
    if (argc > 1) {
        src = strdup(argv[1]);  // get an aligned copy.  glibc malloc happens to do what we need for long-enough strings.
    }
    memset(dstbuf, 'X', 128);   // detect failure to copy the terminating 0 for short strings
    puts(src);

    size_t len = strlen(src);
    size_t len2;
    for (int i = 0 ; i < 40000000 ; ++i) {
        len2 = STRTOUPPER(dstbuf, src);
    }
    printf("%s: strlen=%zu, mylen=%zu\n", dstbuf, len, len2);
}
#endif
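/* Not part of the gist: a sketch of what a separate benchmark driver (the test-flipcase.c
 * from the compile line near the top) might look like, with STRTOUPPER supplied on the
 * command line, e.g. -DSTRTOUPPER=strtoupper_autovec.  The contents here are an assumption,
 * not the author's actual test file.
 *
 *   #include <stdio.h>
 *   size_t STRTOUPPER(char *dst, const char *src);   // whichever version was selected
 *   static char dstbuf[4096];
 *   int main(int argc, char **argv) {
 *       const char *src = (argc > 1) ? argv[1] : "Hello, World";
 *       size_t len = 0;
 *       for (int i = 0; i < 40000000; ++i)
 *           len = STRTOUPPER(dstbuf, src);
 *       printf("%s: len=%zu\n", dstbuf, len);
 *       return 0;
 *   }
 */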