@jsnyder
Created January 18, 2012 22:06
newlib/libc/machine/arm/memcpy.c from CodeSourcery 2011.09 release based on newlib 1.18
/* Copyright (c) 2009 CodeSourcery, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of CodeSourcery nor the
 *     names of its contributors may be used to endorse or promote products
 *     derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY CODESOURCERY, INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL CODESOURCERY BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include "arm_asm.h"
#include <string.h>
#include <stdint.h>
#include <stddef.h>
/* Standard operations for word-sized values. */
#define WORD_REF(ADDRESS, OFFSET) \
  *((WORD_TYPE*)((char*)(ADDRESS) + (OFFSET)))
#define WORD_COPY(OUT, IN, OFFSET) \
  WORD_REF(OUT, OFFSET) = WORD_REF(IN, OFFSET)
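
/* For illustration (not in the original source): with the 32-bit fallback
   configuration below, WORD_COPY(dst, src, 8) expands to

     *((uint32_t*)((char*)(dst) + (8))) = *((uint32_t*)((char*)(src) + (8)));

   i.e. one word-sized load and one word-sized store at byte offset 8.  */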
/* On processors with NEON, we use 128-bit vectors.  Also,
   we need to include arm_neon.h to use these.  */
#if defined(__ARM_NEON__)
#include <arm_neon.h>
#define WORD_TYPE uint8x16_t
#define WORD_SIZE 16
#define MAYBE_PREFETCH(IN) __builtin_prefetch((IN), 0, 0)

/* On ARM processors with 64-bit ldrd instructions, we use those,
   except on Cortex-M* where benchmarking has shown them to
   be slower.  */
#elif defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) \
    || defined(__ARM_ARCH_5TEJ__) || defined(_ISA_ARM_6)
#define WORD_TYPE uint64_t
#define WORD_SIZE 8
#define MAYBE_PREFETCH(IN) __builtin_prefetch((IN), 0, 0)

/* On everything else, we use 32-bit loads and stores, and
   do not use prefetching.  */
#else
#define WORD_TYPE uint32_t
#define WORD_SIZE 4
#define MAYBE_PREFETCH(IN)
#endif
/* On all ARM platforms, 'SHORTWORD' is a 32-bit value. */
#define SHORTWORD_TYPE uint32_t
#define SHORTWORD_SIZE 4
#define SHORTWORD_REF(ADDRESS, OFFSET) \
  *((SHORTWORD_TYPE*)((char*)(ADDRESS) + (OFFSET)))
#define SHORTWORD_COPY(OUT, IN, OFFSET) \
  SHORTWORD_REF(OUT, OFFSET) = SHORTWORD_REF(IN, OFFSET)
/* Shifting directionality depends on endianness. */
#ifdef __ARMEB__
#define SHORTWORD_SHIFT(IN0, IN1, OFFSET) \
  (((IN0) << ((OFFSET)*8)) | ((IN1) >> (SHORTWORD_SIZE*8 - (OFFSET)*8)))
#else
#define SHORTWORD_SHIFT(IN0, IN1, OFFSET) \
  (((IN0) >> ((OFFSET)*8)) | ((IN1) << (SHORTWORD_SIZE*8 - (OFFSET)*8)))
#endif
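
/* Worked example (illustrative): on a little-endian core with OFFSET == 1,
   SHORTWORD_SHIFT(IN0, IN1, 1) is (IN0 >> 8) | (IN1 << 24).  If IN0 holds
   the aligned word covering bytes a..a+3 and IN1 the word covering bytes
   a+4..a+7, the result assembles bytes a+1..a+4, i.e. the 32-bit value
   starting at the misaligned address a+1.  */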
_PTR
_DEFUN (memcpy, (OUT, IN, N),
        _PTR OUT _AND
        _CONST _PTR IN _AND
        size_t N)
{
  void* OUT0 = OUT;

#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
  const char* OUT_end = (char*)OUT + N;
  while ((char*)OUT < OUT_end) {
    *((char*)OUT) = *((char*)IN);
    OUT++;
    IN++;
  }
  return OUT0;
#else
  /* Handle short copies and immediately return.  */
  if (__builtin_expect(N < SHORTWORD_SIZE, 1)) {
    size_t i = 0;
    while (i < N) {
      ((char*)OUT)[i] = ((char*)IN)[i];
      i++;
    }
    return OUT;
  }

  const char* OUT_end = (char*)OUT + N;

  /* Align OUT to SHORTWORD_SIZE.  */
  while ((uintptr_t)OUT % SHORTWORD_SIZE != 0) {
    *(char*) (OUT++) = *(char*) (IN++);
  }

  if ((uintptr_t) IN % SHORTWORD_SIZE == 0) {
#if WORD_SIZE > SHORTWORD_SIZE
    /* Align OUT to WORD_SIZE in steps of SHORTWORD_SIZE.  */
    if (__builtin_expect(OUT_end - (char*)OUT >= WORD_SIZE, 0)) {
      while ((uintptr_t)OUT % WORD_SIZE != 0) {
        SHORTWORD_COPY(OUT, IN, 0);
        OUT += SHORTWORD_SIZE;
        IN += SHORTWORD_SIZE;
      }

      if ((uintptr_t) IN % WORD_SIZE == 0) {
#endif /* WORD_SIZE > SHORTWORD_SIZE */
#if defined(__ARM_NEON__)
      /* Testing on Cortex-A8 indicates that the following idiom
         produces faster assembly code when doing vector copies,
         but not when doing regular copies.  */
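      /* The idiom in question, shown for illustration: index from a
         fixed base and advance a byte offset,

           WORD_COPY(OUT, IN, i);  ...  i += WORD_SIZE * 16;

         rather than bumping the pointers each iteration as the non-NEON
         path further below does:

           WORD_COPY(OUT, IN, 0);  ...  OUT += WORD_SIZE * 16;  */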
      size_t i = 0;
      N = OUT_end - (char*)OUT;

      MAYBE_PREFETCH(IN + 64);
      MAYBE_PREFETCH(IN + 128);
      MAYBE_PREFETCH(IN + 192);

      if (N >= 640) {
        MAYBE_PREFETCH(IN + 256);
        MAYBE_PREFETCH(IN + 320);
        MAYBE_PREFETCH(IN + 384);
        MAYBE_PREFETCH(IN + 448);
        MAYBE_PREFETCH(IN + 512);
        MAYBE_PREFETCH(IN + 576);
        MAYBE_PREFETCH(IN + 640);
        MAYBE_PREFETCH(IN + 704);

        /* We phrase the loop condition in this way so that the
           i + WORD_SIZE * 16 value can be reused to increment i.  */
        while (i + WORD_SIZE * 16 <= N - 640) {
          /* Prefetch ahead of the current copy offset; stopping this
             loop 640 bytes before the end keeps the prefetch window
             over data that is still to be copied.  */
          MAYBE_PREFETCH(IN + i + 768);
          MAYBE_PREFETCH(IN + i + 832);
          MAYBE_PREFETCH(IN + i + 896);
          MAYBE_PREFETCH(IN + i + 960);
          WORD_COPY(OUT, IN, i);
          WORD_COPY(OUT, IN, i + WORD_SIZE * 1);
          WORD_COPY(OUT, IN, i + WORD_SIZE * 2);
          WORD_COPY(OUT, IN, i + WORD_SIZE * 3);
          WORD_COPY(OUT, IN, i + WORD_SIZE * 4);
          WORD_COPY(OUT, IN, i + WORD_SIZE * 5);
          WORD_COPY(OUT, IN, i + WORD_SIZE * 6);
          WORD_COPY(OUT, IN, i + WORD_SIZE * 7);
          WORD_COPY(OUT, IN, i + WORD_SIZE * 8);
          WORD_COPY(OUT, IN, i + WORD_SIZE * 9);
          WORD_COPY(OUT, IN, i + WORD_SIZE * 10);
          WORD_COPY(OUT, IN, i + WORD_SIZE * 11);
          WORD_COPY(OUT, IN, i + WORD_SIZE * 12);
          WORD_COPY(OUT, IN, i + WORD_SIZE * 13);
          WORD_COPY(OUT, IN, i + WORD_SIZE * 14);
          WORD_COPY(OUT, IN, i + WORD_SIZE * 15);
          i += WORD_SIZE * 16;
        }
      }
      while (i + WORD_SIZE * 16 <= N) {
        WORD_COPY(OUT, IN, i);
        WORD_COPY(OUT, IN, i + WORD_SIZE * 1);
        WORD_COPY(OUT, IN, i + WORD_SIZE * 2);
        WORD_COPY(OUT, IN, i + WORD_SIZE * 3);
        WORD_COPY(OUT, IN, i + WORD_SIZE * 4);
        WORD_COPY(OUT, IN, i + WORD_SIZE * 5);
        WORD_COPY(OUT, IN, i + WORD_SIZE * 6);
        WORD_COPY(OUT, IN, i + WORD_SIZE * 7);
        WORD_COPY(OUT, IN, i + WORD_SIZE * 8);
        WORD_COPY(OUT, IN, i + WORD_SIZE * 9);
        WORD_COPY(OUT, IN, i + WORD_SIZE * 10);
        WORD_COPY(OUT, IN, i + WORD_SIZE * 11);
        WORD_COPY(OUT, IN, i + WORD_SIZE * 12);
        WORD_COPY(OUT, IN, i + WORD_SIZE * 13);
        WORD_COPY(OUT, IN, i + WORD_SIZE * 14);
        WORD_COPY(OUT, IN, i + WORD_SIZE * 15);
        i += WORD_SIZE * 16;
      }

      while (i + WORD_SIZE * 4 <= N) {
        WORD_COPY(OUT, IN, i);
        WORD_COPY(OUT, IN, i + WORD_SIZE * 1);
        WORD_COPY(OUT, IN, i + WORD_SIZE * 2);
        WORD_COPY(OUT, IN, i + WORD_SIZE * 3);
        i += WORD_SIZE * 4;
      }

      while (i + WORD_SIZE <= N) {
        WORD_COPY(OUT, IN, i);
        i += WORD_SIZE;
      }

      OUT += i;
      IN += i;
#else /* not defined(__ARM_NEON__) */
      /* Note: 16-times unrolling is about 20% faster than 4-times
         unrolling on both ARM Cortex-A8 and Cortex-M3.  */
      MAYBE_PREFETCH(IN + 64);
      MAYBE_PREFETCH(IN + 128);
      MAYBE_PREFETCH(IN + 192);

      while (OUT_end - (char*)OUT >= WORD_SIZE * 16) {
        MAYBE_PREFETCH(IN + 256);
        MAYBE_PREFETCH(IN + 320);
        WORD_COPY(OUT, IN, 0);
        WORD_COPY(OUT, IN, WORD_SIZE * 1);
        WORD_COPY(OUT, IN, WORD_SIZE * 2);
        WORD_COPY(OUT, IN, WORD_SIZE * 3);
        WORD_COPY(OUT, IN, WORD_SIZE * 4);
        WORD_COPY(OUT, IN, WORD_SIZE * 5);
        WORD_COPY(OUT, IN, WORD_SIZE * 6);
        WORD_COPY(OUT, IN, WORD_SIZE * 7);
        WORD_COPY(OUT, IN, WORD_SIZE * 8);
        WORD_COPY(OUT, IN, WORD_SIZE * 9);
        WORD_COPY(OUT, IN, WORD_SIZE * 10);
        WORD_COPY(OUT, IN, WORD_SIZE * 11);
        WORD_COPY(OUT, IN, WORD_SIZE * 12);
        WORD_COPY(OUT, IN, WORD_SIZE * 13);
        WORD_COPY(OUT, IN, WORD_SIZE * 14);
        WORD_COPY(OUT, IN, WORD_SIZE * 15);
        OUT += WORD_SIZE * 16;
        IN += WORD_SIZE * 16;
      }

      while (WORD_SIZE * 4 <= OUT_end - (char*)OUT) {
        WORD_COPY(OUT, IN, 0);
        WORD_COPY(OUT, IN, WORD_SIZE * 1);
        WORD_COPY(OUT, IN, WORD_SIZE * 2);
        WORD_COPY(OUT, IN, WORD_SIZE * 3);
        OUT += WORD_SIZE * 4;
        IN += WORD_SIZE * 4;
      }

      while (WORD_SIZE <= OUT_end - (char*)OUT) {
        WORD_COPY(OUT, IN, 0);
        OUT += WORD_SIZE;
        IN += WORD_SIZE;
      }
#endif /* not defined(__ARM_NEON__) */
#if WORD_SIZE > SHORTWORD_SIZE
      } else { /* if IN is not WORD_SIZE aligned */
        while (SHORTWORD_SIZE * 4 <= OUT_end - (char*)OUT) {
          SHORTWORD_COPY(OUT, IN, 0);
          SHORTWORD_COPY(OUT, IN, SHORTWORD_SIZE * 1);
          SHORTWORD_COPY(OUT, IN, SHORTWORD_SIZE * 2);
          SHORTWORD_COPY(OUT, IN, SHORTWORD_SIZE * 3);
          OUT += SHORTWORD_SIZE * 4;
          IN += SHORTWORD_SIZE * 4;
        }
      } /* end if IN is not WORD_SIZE aligned */
    } /* end if N >= WORD_SIZE */

    while (SHORTWORD_SIZE <= OUT_end - (char*)OUT) {
      SHORTWORD_COPY(OUT, IN, 0);
      OUT += SHORTWORD_SIZE;
      IN += SHORTWORD_SIZE;
    }
#endif /* WORD_SIZE > SHORTWORD_SIZE */
  } else { /* if IN is not SHORTWORD_SIZE aligned */
    ptrdiff_t misalign = (uintptr_t)IN % SHORTWORD_SIZE;
    SHORTWORD_TYPE temp1, temp2;
    temp1 = SHORTWORD_REF(IN, -misalign);

    /* Benchmarking indicates that unrolling this loop doesn't
       produce a measurable performance improvement on ARM.  */
    while (SHORTWORD_SIZE <= OUT_end - (char*)OUT) {
      IN += SHORTWORD_SIZE;
      temp2 = SHORTWORD_REF(IN, -misalign);
      SHORTWORD_REF(OUT, 0) = SHORTWORD_SHIFT(temp1, temp2, misalign);
      temp1 = temp2;
      OUT += SHORTWORD_SIZE;
    }
  } /* end if IN is not SHORTWORD_SIZE aligned */

  /* Copy any remaining bytes.  */
  while ((char*)OUT < OUT_end) {
    *((char*)OUT) = *((char*)IN);
    OUT++;
    IN++;
  }
  return OUT0;
#endif
}
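
/* Usage sketch (hypothetical, not part of the original file): a quick check
   exercising both the mutually aligned fast path and the misaligned shift
   path, assuming word-aligned backing buffers.

     uint32_t srcbuf[16], dstbuf[16];
     char *src = (char *)srcbuf, *dst = (char *)dstbuf;
     for (int i = 0; i < 64; i++) src[i] = (char)i;
     memcpy(dst, src, 64);          // mutually aligned: word-copy loops
     memcpy(dst + 1, src + 2, 32);  // IN left misaligned: SHORTWORD_SHIFT path
*/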