Skip to content

Instantly share code, notes, and snippets.

@beru
Last active June 16, 2019 19:55
Show Gist options
  • Save beru/d289ac9fbf0d9ba7294e0546ca6c9392 to your computer and use it in GitHub Desktop.
Save beru/d289ac9fbf0d9ba7294e0546ca6c9392 to your computer and use it in GitHub Desktop.
GR_PEACHでメモリコピーの時間測定
// GR-PEACH mbed style project (template V2.03)
// main.cpp: Public domain
#include "mbed.h"
#include "Timer.h"
#include "dma_if.h"
#include "r_bsp_cmn.h"
#include "dcache-control.h"
#include <stdint.h>
#include <stdlib.h>
#include <arm_neon.h>
// LED outputs LED1..LED4; in this file they are only driven by the
// (currently disabled) blink sequence inside main().
DigitalOut led1(LED1);
DigitalOut led2(LED2);
DigitalOut led3(LED3);
DigitalOut led4(LED4);
// Image dimensions used by copyImage(): HEIGHT rows of WIDTH bytes each.
#define WIDTH 640
#define HEIGHT 480
// Source and destination buffers for the copy measurements, 4 MiB each.
uint8_t srcBuff[1024 * 1024 * 4];
uint8_t dstBuff[1024 * 1024 * 4];
// Shared timer used by main() to time copies and stopped by dma_callback().
Timer t;
/**
 * Copy one 128-byte chunk from src to dst and advance both cursors by 128.
 *
 * Two VLDM instructions load d0-d15 (16 x 8 bytes) from src with write-back,
 * then two VSTM instructions store the same registers to dst with write-back,
 * so on return both references point just past the chunk that was copied.
 *
 * @param dst  in/out: destination cursor, advanced past the bytes written
 * @param src  in/out: source cursor, advanced past the bytes read
 */
inline
void copy128(uint8_t*& __restrict dst, const uint8_t*& __restrict src)
{
    // `volatile` is required: the purpose of this asm is its memory side
    // effect.  Without it the compiler may discard the statement whenever the
    // updated pointer values are not used afterwards, silently dropping the copy.
    // "+r" marks each pointer as read and written by the write-back addressing.
    __asm__ volatile (
        "vldm %1!, {d0-d7}\n\t"
        "vldm %1!, {d8-d15}\n\t"
        "vstm %0!, {d0-d7}\n\t"
        "vstm %0!, {d8-d15}\n\t"
        : "+r"(dst), "+r"(src)
        :
        : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
          "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "memory" );
}
/**
 * Copy a WIDTH x HEIGHT byte image from srcBuff to dstBuff, one row at a time.
 *
 * Exactly one of the preprocessor branches below is compiled in; each is a
 * different way of copying a single row so their timings can be compared:
 *   1. memcpy per row
 *   2. plain byte loop
 *   3. NEON intrinsics, 128 bytes per iteration, plus a byte tail
 *   4. copy128() inline asm, 128 bytes per iteration, plus a byte tail (active)
 */
static
void copyImage()
{
    const int n128 = WIDTH / 128;   // whole 128-byte chunks per row
    for (int i=0; i<HEIGHT; ++i) {
#if 0
        memcpy(&dstBuff[i*WIDTH], &srcBuff[i*WIDTH], WIDTH);
#elif 0
        for (int j=0; j<WIDTH; ++j) {
            dstBuff[i*WIDTH+j] = srcBuff[i*WIDTH+j];
        }
#elif 0
        const uint8_t* __restrict pSrc = &srcBuff[i*WIDTH];
        uint8_t* __restrict pDst = &dstBuff[i*WIDTH];
        int j;
        for (j=0; j<n128; ++j) {
            __builtin_prefetch(pSrc + 128);
            __builtin_prefetch(pSrc + 128 + 64);
            uint32x4x4_t v0,v1;
            v0 = vld4q_u32((uint32_t*)pSrc); pSrc += 64;
            v1 = vld4q_u32((uint32_t*)pSrc); pSrc += 64;
            // Store in the same order as loaded; storing v1 first would swap
            // the two 64-byte halves of every 128-byte chunk.
            vst4q_u32((uint32_t*)pDst, v0); pDst += 64;
            vst4q_u32((uint32_t*)pDst, v1); pDst += 64;
        }
        // Remaining WIDTH % 128 bytes of the row.
        for (j*=128; j<WIDTH; ++j) {
            *pDst++ = *pSrc++;
        }
#elif 1
        const uint8_t* __restrict pSrc = &srcBuff[i*WIDTH];
        uint8_t* __restrict pDst = &dstBuff[i*WIDTH];
        int j;
        for (j=0; j<n128; ++j) {
            // Prefetch the next 128-byte chunk (two addresses 64 bytes apart).
            __builtin_prefetch(pSrc + 128);
            __builtin_prefetch(pSrc + 128 + 64);
            copy128(pDst, pSrc);    // advances both pointers by 128
        }
        // Remaining WIDTH % 128 bytes of the row.
        for (j*=128; j<WIDTH; ++j) {
            *pDst++ = *pSrc++;
        }
#endif
    }
}
/**
 * Copy len bytes from pSrc to pDst.
 *
 * The bulk is moved in 128-byte chunks (copy128() inline asm by default, or
 * NEON intrinsics when the first preprocessor branch is enabled); the
 * remaining len % 128 bytes are copied one at a time.
 *
 * @param pDst  destination; must not overlap the source (__restrict)
 * @param pSrc  source
 * @param len   number of bytes to copy
 */
static
void copyData(uint8_t* __restrict pDst,
              const uint8_t* __restrict pSrc,
              size_t len)
{
    // All counters are size_t so they match `len`: no signed/unsigned
    // comparison and no signed overflow for very large lengths.
    const size_t n128 = len / 128;
#if 0
    for (size_t j=0; j<n128; ++j) {
        __builtin_prefetch(pSrc + 128);
        __builtin_prefetch(pSrc + 128 + 64);
        uint32x4x4_t v0,v1;
        v0 = vld4q_u32((uint32_t*)pSrc); pSrc += 64;
        v1 = vld4q_u32((uint32_t*)pSrc); pSrc += 64;
        // Store in the same order as loaded; storing v1 first would swap
        // the two 64-byte halves of every 128-byte chunk.
        vst4q_u32((uint32_t*)pDst, v0); pDst += 64;
        vst4q_u32((uint32_t*)pDst, v1); pDst += 64;
    }
#else
    for (size_t j=0; j<n128; ++j) {
        // Prefetch the next 128-byte chunk (two addresses 64 bytes apart).
        __builtin_prefetch(pSrc + 128);
        __builtin_prefetch(pSrc + 128 + 64);
        copy128(pDst, pSrc);    // advances both pointers by 128
    }
#endif
    // Byte-wise tail: whatever is left after the whole 128-byte chunks.
    for (size_t rest = len % 128; rest != 0; --rest) {
        *pDst++ = *pSrc++;
    }
}
// Completion callback for the DMA transfer; it is registered through
// aiocb.aio_sigevent.sigev_notify_function in the (currently disabled) DMA
// section of main().  The shared timer is stopped before printing, so the
// time spent in printf is not part of the measurement.  `param` is unused.
// NOTE(review): the execution context of this callback (thread vs. interrupt)
// is not visible from this file - confirm that calling printf here is safe.
static
void dma_callback(union sigval const param)
{
t.stop();
printf("dma_callback\r\n");
}
/**
 * Sum the first len bytes of dstBuff.
 *
 * No bounds check is performed; the caller must keep len <= sizeof(dstBuff).
 *
 * @param len  number of leading bytes of dstBuff to add up
 * @return     the sum of those bytes
 */
static
int countSum(size_t len)
{
    int sum = 0;
    // size_t index matches the type of `len` (no signed/unsigned comparison).
    for (size_t j = 0; j < len; ++j) {
        sum += dstBuff[j];
    }
    return sum;
}
/**
 * Memory-copy timing harness.
 *
 * Fills srcBuff with 1s and dstBuff with 0s, then loops forever running the
 * measurement selected by the preprocessor switches inside the loop.  The
 * active one times copyData() for lengths 64, 128, ... (doubling, at most 16
 * steps) and prints "<length>\t<milliseconds>" for each.
 */
int main() {
    printf("blinky\r\n");
    // Source is all 1s and destination all 0s, so after copying `len` bytes
    // the sum of the first `len` destination bytes should equal `len`.
    memset(srcBuff, 1, sizeof(srcBuff));
    memset(dstBuff, 0, sizeof(dstBuff));
//  dcache_clean(srcBuff, sizeof(srcBuff));
//  dcache_invalid(dstBuff, sizeof(dstBuff));
#if 0
    // Disabled: time one DMA transfer of the whole source buffer.
    int ret;
    R_BSP_CMN_Init();
#if 0
    // NOTE(review): enabling this inner block together with the outer one
    // declares `aiocb` twice in the same scope.
    dma_drv_init_t dmaInit = {0};
    AIOCB aiocb;
    dmaInit.channel[0] = true;
    dmaInit.p_aio = &aiocb;
    ret = R_DMA_Init(&dmaInit, NULL);
    if (ret == -1)
        printf("R_DMA_Init failed.\r\n");
#endif
    int ch = R_DMA_Alloc(-1, NULL);
    if (ch == -1)
        printf("R_DMA_Alloc failed.\r\n");
    dma_ch_setup_t setup;
    setup.resource = DMA_RS_OSTIM1;
    setup.direction = DMA_REQ_SRC;
    setup.dst_width = DMA_UNIT_128;
    setup.src_width = DMA_UNIT_128;
    setup.dst_cnt = DMA_ADDR_INCREMENT;
    setup.src_cnt = DMA_ADDR_INCREMENT;
    AIOCB aiocb;
    aiocb.aio_sigevent.sigev_notify = SIGEV_THREAD;
    aiocb.aio_sigevent.sigev_value.sival_int = ch;
    aiocb.aio_sigevent.sigev_notify_function = &dma_callback;
    setup.p_aio = &aiocb;
    ret = R_DMA_Setup(ch, &setup, NULL);
    if (ret == -1)
        printf("R_DMA_Setup failed.\r\n");
    dma_trans_data_t data;
    data.src_addr = srcBuff;
    data.dst_addr = dstBuff;
    data.count = sizeof(srcBuff);
    t.reset();
    t.start();
    ret = R_DMA_Start(ch, &data, NULL);
    if (ret == -1)
        printf("R_DMA_Start failed.\r\n");
    wait(1.0);
    printf("%f\r\n", t.read() * 1000.0);
#endif
    while(1) {
#if 0
        // Disabled: switch the four LEDs on one by one, then off one by one.
        led1 = 1;
        wait(0.2);
        led2 = 1;
        wait(0.2);
        led3 = 1;
        wait(0.2);
        led4 = 1;
        wait(0.2);
        led1 = 0;
        wait(0.2);
        led2 = 0;
        wait(0.2);
        led3 = 0;
        wait(0.2);
        led4 = 0;
        wait(0.2);
#endif
#if 1
        // Active: time copyData() for doubling lengths starting at 64 bytes.
        size_t len = 64;
        for (int i=0; i<16; ++i) {
            t.reset();
            t.start();
            copyData(dstBuff, srcBuff, len);
            t.stop();
            // size_t must not be passed to %d; print it as unsigned long.
            printf("%lu\t%f\r\n", (unsigned long)len, t.read() * 1000.0);
            int sum = countSum(len);
            // Explicit conversion keeps the original comparison semantics
            // without an implicit signed/unsigned mix.
            if ((size_t)sum != len) {
                printf("invalid count. %d\r\n", sum);
            }
            len *= 2;
            if (len > sizeof(dstBuff)) {
                break;
            }
        }
        printf("\r\n");
#elif 0
        // Disabled: time one whole-image copy.  The timer is reset first so
        // each pass reports its own duration instead of an accumulated total.
        t.reset();
        t.start();
        copyImage();
        t.stop();
        printf("%f\r\n", t.read() * 1000.0);
#else
        // Fallback: print the byte sum of the whole destination buffer.
        int sum = countSum(sizeof(dstBuff));
        printf("sum : %d\r\n", sum);
        wait(0.1);
#endif
    }
}
@beru
Copy link
Author

beru commented Jun 16, 2019

https://github.com/d-kato/mbed-gr-libs/blob/cee9705b4e56aff62f3135e586b9817133635b73/R_BSP/RenesasBSP/drv_src/TARGET_RZ_A1XX/dma/dma.c#L667

新しい mbed-gr-libs のソースだと割込みの優先度を指定してる。でも Mbed OS のオーバーヘッドなんてたかがしれてるだろうしなぁ…。

ちなみに自分が mbed-gr-libs の dma.c ファイル中の DMA_Start 関数を改変して追加したコードは下記のようなもの。

    gb_info_drv.info_ch[channel].p_dma_ch_reg->CHCFG_n |= 0x400 | 0x400000;
    gb_info_drv.info_ch[channel].p_dma_ch_reg->CHCTRL_n = CHCTRL_SET_STG;

あとDMA転送完了のコールバック関数内で printf でタイマーの経過時間を出力しようとすると固まってしまう。
割込み側では書式 %f が使えないのかも。。

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment