Skip to content

Instantly share code, notes, and snippets.

@jwbensley
Created June 26, 2020 08:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jwbensley/2e67f096ff5af74bbcba6f60fa06a481 to your computer and use it in GitHub Desktop.
Save jwbensley/2e67f096ff5af74bbcba6f60fa06a481 to your computer and use it in GitHub Desktop.
memcpy() using cache and page aligned values
#include <inttypes.h> // uint*_t
#include <stdio.h> // perror(), printf()
#include <stdlib.h> // malloc(), free(), posix_memalign()
#include <string.h> // memcpy()
#include <time.h> // clock_t, CLOCKS_PER_SEC
#include <unistd.h> // getpagesize()
/*
 * Copy exactly `len` bytes from `from` to `to`, issuing the copy as a series
 * of fixed 64-byte memcpy() calls (64 bytes being the cache-line size this
 * benchmark assumes) followed by one shorter copy for any remainder.
 *
 * Unlike a naive "always copy 64 bytes" loop, this never reads or writes
 * beyond `len`, so it is safe for lengths that are not a multiple of 64,
 * including zero.
 *
 * to   - destination buffer, at least `len` bytes
 * from - source buffer, at least `len` bytes
 * len  - number of bytes to copy
 *
 * The two regions must not overlap (same rule as memcpy()).
 */
static inline void memcpy_aligned(void *to, const void *from, size_t len) {
    enum { CHUNK = 64 };
    uint8_t *dst = to;
    const uint8_t *src = from;
    size_t offset = 0;

    /* Whole chunks first; a constant size lets the compiler inline each copy. */
    for (; len - offset >= CHUNK; offset += CHUNK) {
        memcpy(dst + offset, src + offset, CHUNK);
    }

    /* Remaining tail of fewer than CHUNK bytes, if any. */
    if (offset < len) {
        memcpy(dst + offset, src + offset, len - offset);
    }
}
/*
 * Time `loops` back-to-back copies of `len` bytes from `src` to `dst` and
 * return the elapsed CPU time in seconds, as measured by clock().
 *
 * use_aligned selects memcpy_aligned() (non-zero) or plain memcpy() (zero).
 * The choice is made once, outside the timed loops, so the branch itself is
 * not part of the measurement.
 *
 * NOTE(review): nothing reads `dst` after the copies, so an optimizing
 * compiler may be able to drop them - confirm the build flags keep the work.
 */
static double time_copy_loop(void *dst, const void *src, size_t len,
                             uint32_t loops, int use_aligned) {
    const clock_t start = clock();
    if (use_aligned) {
        for (uint32_t j = 0; j < loops; j += 1) {
            memcpy_aligned(dst, src, len);
        }
    } else {
        for (uint32_t j = 0; j < loops; j += 1) {
            memcpy(dst, src, len);
        }
    }
    const clock_t end = clock();
    return (double)(end - start) / CLOCKS_PER_SEC;
}

/*
 * Run the benchmark for every copy size in the table that fits in `capacity`
 * bytes and print one result line per size for memcpy(), plus one for
 * memcpy_aligned() when `with_aligned` is non-zero.
 */
static void run_size_sweep(void *dst, const void *src, size_t capacity,
                           uint32_t loops, int with_aligned) {
    static const size_t sizes[] = {32, 64, 96, 128, 256, 512, 1024, 2048, 4096};
    const size_t count = sizeof sizes / sizeof sizes[0];

    for (size_t i = 0; i < count; i += 1) {
        /* The table is ascending, so nothing after this entry fits either. */
        if (sizes[i] > capacity) {
            break;
        }
        printf("memcpy(%zu) took %f seconds to execute \n", sizes[i],
               time_copy_loop(dst, src, sizes[i], loops, 0));
        if (with_aligned) {
            printf("memcpy_aligned(%zu) took %f seconds to execute \n", sizes[i],
                   time_copy_loop(dst, src, sizes[i], loops, 1));
        }
    }
}

/*
 * Allocate a source and destination buffer of `bytes` bytes each - with
 * posix_memalign() on a `page` boundary when `page_aligned` is non-zero,
 * otherwise with malloc() - run the size sweep for memcpy() and
 * memcpy_aligned() over them, and release both buffers.
 *
 * Returns 0 on success, 1 if an allocation failed (a message is written to
 * stderr in that case).
 */
static int run_heap_case(size_t bytes, int page_aligned, size_t page,
                         uint32_t loops) {
    void *src = NULL;
    void *dst = NULL;
    int status = 1;

    if (page_aligned) {
        /* posix_memalign() reports failure through its return value, not errno. */
        int rc = posix_memalign(&src, page, bytes);
        if (rc != 0) {
            fprintf(stderr, "posix_memalign(src) failed: %s\n", strerror(rc));
            src = NULL; /* do not rely on the pointer being left untouched */
            goto out;
        }
        rc = posix_memalign(&dst, page, bytes);
        if (rc != 0) {
            fprintf(stderr, "posix_memalign(dst) failed: %s\n", strerror(rc));
            dst = NULL;
            goto out;
        }
    } else {
        src = malloc(bytes);
        dst = malloc(bytes);
        if (src == NULL || dst == NULL) {
            perror("malloc failed");
            goto out;
        }
    }

    /* Fill the source so the copies never read indeterminate bytes, and
     * touch the destination before any timing starts. */
    memset(src, 0xA5, bytes);
    memset(dst, 0, bytes);

    printf("%s size is %zu, src=%p, dst=%p\n",
           page_aligned ? "Page aligned malloc" : "Malloc", bytes, src, dst);
    run_size_sweep(dst, src, bytes, loops, 1);
    printf("\n");
    status = 0;

out:
    free(src);
    free(dst);
    return status;
}

/*
 * Benchmark driver. Prints the loop count and page size, then times
 * memcpy() and memcpy_aligned() for:
 *   - 1024-byte malloc() and page-aligned buffers,
 *   - 4096-byte malloc() and page-aligned buffers,
 * and finally times memcpy() alone inside one page-aligned 16384-byte
 * buffer, with the destination placed 4096 bytes after the source plus an
 * extra shift of 0, 1024, ... 5120 bytes.
 *
 * Returns 0 on success and 1 if any allocation fails.
 */
int main(int argc, char **argv) {
    enum {
        BUF_BYTES = 16384, /* size of the shared page-aligned buffer */
        DST_BASE = 4096,   /* destination offset for the "offset is 0" case */
        SHIFT_STEP = 1024, /* extra destination shift added per case */
        SHIFT_MAX = 5120   /* last extra shift exercised */
    };
    const uint32_t loops = 100000;
    const int page = getpagesize();
    void *buf = NULL; // buf is for n-way cache associated CPUs
    int status = 1;

    (void)argc;
    (void)argv;

    printf("Loop count is %" PRIu32 "\n", loops);
    printf("Page size is %d\n\n", page);

    const int rc = posix_memalign(&buf, (size_t)page, BUF_BYTES);
    if (rc != 0) {
        fprintf(stderr, "posix_memalign(buf) failed: %s\n", strerror(rc));
        return 1;
    }
    memset(buf, 0xA5, BUF_BYTES);

    if (run_heap_case(1024, 0, (size_t)page, loops) != 0 ||
        run_heap_case(1024, 1, (size_t)page, loops) != 0 ||
        run_heap_case(4096, 0, (size_t)page, loops) != 0 ||
        run_heap_case(4096, 1, (size_t)page, loops) != 0) {
        goto out;
    }

    /* Byte pointer: arithmetic on void * is a compiler extension, not ISO C. */
    uint8_t *const base = buf;
    for (size_t shift = 0; shift <= SHIFT_MAX; shift += SHIFT_STEP) {
        void *src = base;
        void *dst = base + DST_BASE + shift;

        printf("Cache aligned malloc, offset is %s%zu, buf=%p, src=%p, dst=%p\n",
               shift == 0 ? "" : "+", shift, buf, src, dst);
        /* dst is at least DST_BASE bytes past src, so copies of up to
         * DST_BASE bytes never overlap; capacity bounds the write side. */
        run_size_sweep(dst, src, (size_t)BUF_BYTES - (DST_BASE + shift), loops, 0);
        printf("\n");
    }
    status = 0;

out:
    free(buf);
    return status;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment