Skip to content

Instantly share code, notes, and snippets.

@battlesnake
Last active October 19, 2022 07:31
Show Gist options
  • Save battlesnake/2cce48284e2230ed05c80272f2419e72 to your computer and use it in GitHub Desktop.
Save battlesnake/2cce48284e2230ed05c80272f2419e72 to your computer and use it in GitHub Desktop.
STM32H750 DMA-based memcpy performance
Max clocks (480MHz CPU/SysTick, 240MHz AXI+AHB1+AHB2)
Using DMA2 (in D2 domain)
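Copying 32-bit words, non-burst transfers
Note: the rates labelled "kB/s" in the output below are actually MiB/s (bytes * 480 MHz / cycles / 1048576)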
================================================================================
== D1 AXI SRAM to D1 AXI SRAM
================================================================================
Copying 64 bytes from D1 AXI SRAM to D1 AXI SRAM
162 cycles to start DMA copy
263 cycles until first byte changed
723 cycles until last byte changed
40 kB/s overall
63 kB/s since first byte changed
158 kB/s for memcpy via CPU
0.2x relative to memcpy
Copying 256 bytes from D1 AXI SRAM to D1 AXI SRAM
162 cycles to start DMA copy
263 cycles until first byte changed
2363 cycles until last byte changed
49 kB/s overall
55 kB/s since first byte changed
205 kB/s for memcpy via CPU
0.2x relative to memcpy
Copying 4096 bytes from D1 AXI SRAM to D1 AXI SRAM
162 cycles to start DMA copy
263 cycles until first byte changed
35163 cycles until last byte changed
53 kB/s overall
53 kB/s since first byte changed
227 kB/s for memcpy via CPU
0.2x relative to memcpy
Copying 65536 bytes from D1 AXI SRAM to D1 AXI SRAM
162 cycles to start DMA copy
263 cycles until first byte changed
559939 cycles until last byte changed
53 kB/s overall
53 kB/s since first byte changed
228 kB/s for memcpy via CPU
0.2x relative to memcpy
================================================================================
== D2 AHB SRAM to D2 AHB SRAM
================================================================================
Copying 64 bytes from D2 AHB SRAM to D2 AHB SRAM
162 cycles to start DMA copy
267 cycles until first byte changed
371 cycles until last byte changed
78 kB/s overall
281 kB/s since first byte changed
161 kB/s for memcpy via CPU
0.4x relative to memcpy
Copying 256 bytes from D2 AHB SRAM to D2 AHB SRAM
162 cycles to start DMA copy
267 cycles until first byte changed
943 cycles until last byte changed
124 kB/s overall
173 kB/s since first byte changed
207 kB/s for memcpy via CPU
0.5x relative to memcpy
Copying 4096 bytes from D2 AHB SRAM to D2 AHB SRAM
162 cycles to start DMA copy
267 cycles until first byte changed
13063 cycles until last byte changed
143 kB/s overall
146 kB/s since first byte changed
227 kB/s for memcpy via CPU
0.6x relative to memcpy
Copying 65536 bytes from D2 AHB SRAM to D2 AHB SRAM
162 cycles to start DMA copy
267 cycles until first byte changed
206931 cycles until last byte changed
144 kB/s overall
145 kB/s since first byte changed
228 kB/s for memcpy via CPU
0.6x relative to memcpy
================================================================================
== D1 AXI SRAM to D2 AHB SRAM
================================================================================
Copying 64 bytes from D1 AXI SRAM to D2 AHB SRAM
162 cycles to start DMA copy
267 cycles until first byte changed
427 cycles until last byte changed
68 kB/s overall
183 kB/s since first byte changed
161 kB/s for memcpy via CPU
0.4x relative to memcpy
Copying 256 bytes from D1 AXI SRAM to D2 AHB SRAM
162 cycles to start DMA copy
267 cycles until first byte changed
1215 cycles until last byte changed
96 kB/s overall
123 kB/s since first byte changed
207 kB/s for memcpy via CPU
0.4x relative to memcpy
Copying 4096 bytes from D1 AXI SRAM to D2 AHB SRAM
162 cycles to start DMA copy
267 cycles until first byte changed
16335 cycles until last byte changed
114 kB/s overall
116 kB/s since first byte changed
227 kB/s for memcpy via CPU
0.5x relative to memcpy
Copying 65536 bytes from D1 AXI SRAM to D2 AHB SRAM
162 cycles to start DMA copy
265 cycles until first byte changed
258253 cycles until last byte changed
116 kB/s overall
116 kB/s since first byte changed
228 kB/s for memcpy via CPU
0.5x relative to memcpy
Copying 131072 bytes from D1 AXI SRAM to D2 AHB SRAM
162 cycles to start DMA copy
265 cycles until first byte changed
516307 cycles until last byte changed
116 kB/s overall
116 kB/s since first byte changed
228 kB/s for memcpy via CPU
0.5x relative to memcpy
================================================================================
== D2 AHB SRAM to D1 AXI SRAM
================================================================================
Copying 64 bytes from D2 AHB SRAM to D1 AXI SRAM
162 cycles to start DMA copy
267 cycles until first byte changed
479 cycles until last byte changed
61 kB/s overall
138 kB/s since first byte changed
158 kB/s for memcpy via CPU
0.3x relative to memcpy
Copying 256 bytes from D2 AHB SRAM to D1 AXI SRAM
162 cycles to start DMA copy
265 cycles until first byte changed
1341 cycles until last byte changed
87 kB/s overall
108 kB/s since first byte changed
205 kB/s for memcpy via CPU
0.4x relative to memcpy
Copying 4096 bytes from D2 AHB SRAM to D1 AXI SRAM
162 cycles to start DMA copy
265 cycles until first byte changed
18621 cycles until last byte changed
100 kB/s overall
102 kB/s since first byte changed
227 kB/s for memcpy via CPU
0.4x relative to memcpy
Copying 65536 bytes from D2 AHB SRAM to D1 AXI SRAM
162 cycles to start DMA copy
265 cycles until first byte changed
295123 cycles until last byte changed
101 kB/s overall
101 kB/s since first byte changed
228 kB/s for memcpy via CPU
0.4x relative to memcpy
Copying 131072 bytes from D2 AHB SRAM to D1 AXI SRAM
162 cycles to start DMA copy
265 cycles until first byte changed
590013 cycles until last byte changed
101 kB/s overall
101 kB/s since first byte changed
228 kB/s for memcpy via CPU
0.4x relative to memcpy
/*
 * Describes one memory region used as the source or destination of a
 * benchmark copy.
 */
struct ramdef
{
    const char *name;       /* human-readable label printed in the report */
    volatile uint8_t *ptr;  /* first byte of the region */
    uint32_t size;          /* number of bytes usable through ptr */
};
#define FREQ (480000000ULL)
#define MEM_SIZE (0x28000)
static volatile uint8_t d1buf[MEM_SIZE] _dma_align _d1ram_bss;
static volatile uint8_t d2buf[MEM_SIZE] _dma_align _d2ram_bss;
const struct ramdef d1 = { .name = "D1 AXI SRAM", .ptr = d1buf, .size = 0x20000 };
const struct ramdef d1_a = { .name = "D1 AXI SRAM", .ptr = d1buf, .size = 0x10000 };
const struct ramdef d1_b = { .name = "D1 AXI SRAM", .ptr = d1buf + 0x10000, .size = 0x10000 };
const struct ramdef d2 = { .name = "D2 AHB SRAM", .ptr = d2buf, .size = 0x20000 };
const struct ramdef d2_a = { .name = "D2 AHB SRAM", .ptr = d2buf, .size = 0x10000 };
const struct ramdef d2_b = { .name = "D2 AHB SRAM", .ptr = d2buf + 0x10000, .size = 0x10000 };
/*
 * Convert "bytes moved in cycles clock ticks" into a rate in MiB/s, using
 * FREQ as the tick frequency.  Returns 0 when no ticks elapsed so that the
 * caller never divides by zero.
 */
static uint64_t rate_mib_per_s(size_t bytes, uint32_t cycles)
{
    if (cycles == 0) {
        return 0;
    }
    return (uint64_t) bytes * FREQ / cycles / 1048576;
}

/*
 * Benchmark one DMA copy of size bytes from src to dst and print a report.
 *
 * The source is filled with 0x55 and the destination with 0x00; the
 * destination is then polled until its first and last bytes read 0x55.
 * Four intervals are reported: time to return from dma_memcpy_start(),
 * time until the first byte arrives, time until the last byte arrives,
 * and the time the CPU needed to clear the destination.
 *
 * The test is skipped silently when size does not fit in either region or
 * is smaller than the 32-byte span that is refreshed on every poll.
 *
 * NOTE: the polling loops have no timeout; if the DMA transfer never
 * completes this function does not return.
 */
static void run_test(const struct ramdef *src, const struct ramdef *dst, size_t size)
{
    /* Bytes passed to dma_pre_receive() per poll (presumably one cache line - confirm). */
    const size_t poll_span = 32;

    /* size < poll_span would make "size - poll_span" wrap around below. */
    if (size < poll_span || src->size < size || dst->size < size) {
        return;
    }

    /* memset() takes a non-volatile pointer; the barriers order the accesses. */
    memset((void *) src->ptr, 0x55, size);
    __DSB();
    __ISB();
    volatile uint32_t tx = read_cycle_counter();
    __DSB();
    __ISB();
    /*
     * Clearing the destination doubles as the CPU baseline.  Note that it
     * is a memset (write-only), although the report labels it "memcpy".
     */
    memset((void *) dst->ptr, 0x00, size);
    __DSB();
    __ISB();
    volatile uint32_t ty = read_cycle_counter();
    __DSB();
    __ISB();

    /* Cache maintenance - flush */
    dma_pre_transmit(src->ptr, size);
    dma_pre_transmit(dst->ptr, size);
    __DSB();
    __ISB();
    volatile uint32_t t0 = read_cycle_counter();
    __DSB();
    __ISB();
    /* NOTE(review): first argument is presumably a channel/stream index - confirm. */
    dma_memcpy_start(0, dst->ptr, src->ptr, size);
    __DSB();
    __ISB();
    volatile uint32_t t1 = read_cycle_counter();
    __DSB();
    __ISB();

    /* Wait for the first destination byte to take the source pattern. */
    do {
        dma_pre_receive(&dst->ptr[0], poll_span);
    } while (dst->ptr[0] != 0x55);
    __DSB();
    __ISB();
    volatile uint32_t t2 = read_cycle_counter();
    __DSB();
    __ISB();

    /* Wait for the last destination byte to take the source pattern. */
    do {
        dma_pre_receive(&dst->ptr[size - poll_span], poll_span);
    } while (dst->ptr[size - 1] != 0x55);
    __DSB();
    __ISB();
    volatile uint32_t t3 = read_cycle_counter();
    __DSB();
    __ISB();

    /* Unsigned subtraction keeps the deltas correct across counter wrap. */
    uint64_t rate_30 = rate_mib_per_s(size, t3 - t0);
    uint64_t rate_32 = rate_mib_per_s(size, t3 - t2);
    uint64_t rate_yx = rate_mib_per_s(size, ty - tx);
    /* Ratio in tenths; reported as 0.0 when the baseline rate rounds to 0. */
    uint32_t speedup = rate_yx != 0 ? (uint32_t) (rate_30 * 10 / rate_yx) : 0;

    isr_printf("Copying %u bytes from %s to %s\n", (unsigned) size, src->name, dst->name);
    isr_printf(" %u cycles to start DMA copy\n", (unsigned) (t1 - t0));
    isr_printf(" %u cycles until first byte changed\n", (unsigned) (t2 - t0));
    isr_printf(" %u cycles until last byte changed\n", (unsigned) (t3 - t0));
    /* The rates are bytes/s divided by 2^20, i.e. MiB/s (previously mislabelled kB/s). */
    isr_printf(" %llu MiB/s overall\n", (unsigned long long) rate_30);
    isr_printf(" %llu MiB/s since first byte changed\n", (unsigned long long) rate_32);
    isr_printf(" %llu MiB/s for memcpy via CPU\n", (unsigned long long) rate_yx);
    isr_printf(" %u.%ux relative to memcpy\n", (unsigned) (speedup / 10), (unsigned) (speedup % 10));
    isr_printf("\n");
}
/*
 * Print a banner naming the source and destination regions, then run the
 * copy benchmark once for each transfer size in the table below.
 */
static void run_tests(const struct ramdef *src, const struct ramdef *dst)
{
    /* Transfer sizes in bytes, benchmarked in this order. */
    static const size_t transfer_sizes[] = { 64, 256, 4096, 65536, 131072 };
    const size_t count = sizeof transfer_sizes / sizeof transfer_sizes[0];

    isr_printf("\n");
    isr_printf("================================================================================\n");
    isr_printf("== %s to %s\n", src->name, dst->name);
    isr_printf("================================================================================\n");

    for (size_t idx = 0; idx < count; idx++) {
        run_test(src, dst, transfer_sizes[idx]);
    }

    isr_printf("\n");
}
_noreturn
void entry_point()
{
    /* Source/destination pairings, benchmarked in this order. */
    static const struct ramdef *const pairings[][2] = {
        { &d1_a, &d1_b },   /* within the D1 buffer */
        { &d2_a, &d2_b },   /* within the D2 buffer */
        { &d1, &d2 },       /* D1 buffer to D2 buffer */
        { &d2, &d1 },       /* D2 buffer to D1 buffer */
    };
    const size_t pairing_count = sizeof pairings / sizeof pairings[0];

    /* Bring up DMA and the serial output before any report is printed. */
    dma_mem_init();
    serial_init();
    serial_tx_init();

    for (size_t idx = 0; idx < pairing_count; idx++) {
        run_tests(pairings[idx][0], pairings[idx][1]);
    }

    /* Nothing left to do: idle forever. */
    for (;;) {
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment