Last active
June 16, 2019 19:55
-
-
Save beru/d289ac9fbf0d9ba7294e0546ca6c9392 to your computer and use it in GitHub Desktop.
GR_PEACHでメモリコピーの時間測定
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// GR-PEACH mbed style project (template V2.03) | |
// main.cpp: Public domain | |
#include "mbed.h" | |
#include "Timer.h" | |
#include "dma_if.h" | |
#include "r_bsp_cmn.h" | |
#include "dcache-control.h" | |
#include <stdint.h> | |
#include <stdlib.h> | |
#include <arm_neon.h> | |
DigitalOut led1(LED1); | |
DigitalOut led2(LED2); | |
DigitalOut led3(LED3); | |
DigitalOut led4(LED4); | |
#define WIDTH 640 | |
#define HEIGHT 480 | |
uint8_t srcBuff[1024 * 1024 * 4]; | |
uint8_t dstBuff[1024 * 1024 * 4]; | |
Timer t; | |
// Copies one 128-byte chunk from src to dst and advances both pointers
// past the chunk.
//
// dst: reference to the destination pointer; updated by the write-back stores.
// src: reference to the source pointer; updated by the write-back loads.
//
// The two vldm instructions load d0-d15 (16 registers x 8 bytes = 128 bytes)
// and the two vstm instructions store them again. The "!" suffix writes the
// incremented address back into the base register, which is why both pointers
// are read-write ("+r") operands.
inline
void copy128(uint8_t*& __restrict dst, const uint8_t*& __restrict src)
{
	// volatile: this asm exists for its memory side effect. Without it the
	// compiler is allowed to discard the statement whenever the updated
	// pointer values turn out to be unused (e.g. after the last chunk when
	// the tail loop is provably empty), silently dropping the copy.
	__asm__ volatile (
		"vldm %1!, {d0-d7}\n\t"
		"vldm %1!, {d8-d15}\n\t"
		"vstm %0!, {d0-d7}\n\t"
		"vstm %0!, {d8-d15}\n\t"
		: "+r"(dst), "+r"(src)
		:
		: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
		  "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "memory" );
}
static | |
void copyImage() | |
{ | |
const int n128 = WIDTH / 128; | |
for (int i=0; i<HEIGHT; ++i) { | |
#if 0 | |
memcpy(&dstBuff[i*WIDTH], &srcBuff[i*WIDTH], WIDTH); | |
#elif 0 | |
for (int j=0; j<WIDTH; ++j) { | |
dstBuff[i*WIDTH+j] = srcBuff[i*WIDTH+j]; | |
} | |
#elif 0 | |
const uint8_t* __restrict pSrc = &srcBuff[i*WIDTH]; | |
uint8_t* __restrict pDst = &dstBuff[i*WIDTH]; | |
int j; | |
for (j=0; j<n128; ++j) { | |
__builtin_prefetch(pSrc + 128); | |
__builtin_prefetch(pSrc + 128 + 64); | |
uint32x4x4_t v0,v1; | |
v0 = vld4q_u32((uint32_t*)pSrc); pSrc += 64; | |
v1 = vld4q_u32((uint32_t*)pSrc); pSrc += 64; | |
vst4q_u32((uint32_t*)pDst, v1); pDst += 64; | |
vst4q_u32((uint32_t*)pDst, v0); pDst += 64; | |
} | |
for (j*=128; j<WIDTH; ++j) { | |
*pDst++ = *pSrc++; | |
} | |
#elif 1 | |
const uint8_t* __restrict pSrc = &srcBuff[i*WIDTH]; | |
uint8_t* __restrict pDst = &dstBuff[i*WIDTH]; | |
int j; | |
for (j=0; j<n128; ++j) { | |
__builtin_prefetch(pSrc + 128); | |
__builtin_prefetch(pSrc + 128 + 64); | |
copy128(pDst, pSrc); | |
} | |
for (j*=128; j<WIDTH; ++j) { | |
*pDst++ = *pSrc++; | |
} | |
#endif | |
} | |
} | |
static | |
void copyData(uint8_t* __restrict pDst, | |
const uint8_t* __restrict pSrc, | |
size_t len) | |
{ | |
const int n128 = len / 128; | |
int j; | |
#if 0 | |
for (j=0; j<n128; ++j) { | |
__builtin_prefetch(pSrc + 128); | |
__builtin_prefetch(pSrc + 128 + 64); | |
uint32x4x4_t v0,v1; | |
v0 = vld4q_u32((uint32_t*)pSrc); pSrc += 64; | |
v1 = vld4q_u32((uint32_t*)pSrc); pSrc += 64; | |
vst4q_u32((uint32_t*)pDst, v1); pDst += 64; | |
vst4q_u32((uint32_t*)pDst, v0); pDst += 64; | |
} | |
#else | |
for (j=0; j<n128; ++j) { | |
__builtin_prefetch(pSrc + 128); | |
__builtin_prefetch(pSrc + 128 + 64); | |
copy128(pDst, pSrc); | |
} | |
#endif | |
for (j*=128; j<len; ++j) { | |
*pDst++ = *pSrc++; | |
} | |
} | |
static | |
void dma_callback(union sigval const param) | |
{ | |
t.stop(); | |
printf("dma_callback\r\n"); | |
} | |
static | |
int countSum(size_t len) | |
{ | |
int sum = 0; | |
for (int j=0; j<len; ++j) { | |
sum += dstBuff[j]; | |
} | |
return sum; | |
} | |
int main() { | |
printf("blinky\r\n"); | |
memset(srcBuff, 1, sizeof(srcBuff)); | |
memset(dstBuff, 0, sizeof(dstBuff)); | |
// dcache_clean(srcBuff, sizeof(srcBuff)); | |
// dcache_invalid(dstBuff, sizeof(dstBuff)); | |
int ret; | |
#if 0 | |
R_BSP_CMN_Init(); | |
#if 0 | |
dma_drv_init_t dmaInit = {0}; | |
AIOCB aiocb; | |
dmaInit.channel[0] = true; | |
dmaInit.p_aio = &aiocb; | |
ret = R_DMA_Init(&dmaInit, NULL); | |
if (ret == -1) | |
printf("R_DMA_Init failed.\r\n"); | |
#endif | |
int ch = R_DMA_Alloc(-1, NULL); | |
if (ch == -1) | |
printf("R_DMA_Alloc failed.\r\n"); | |
dma_ch_setup_t setup; | |
setup.resource = DMA_RS_OSTIM1; | |
setup.direction = DMA_REQ_SRC; | |
setup.dst_width = DMA_UNIT_128; | |
setup.src_width = DMA_UNIT_128; | |
setup.dst_cnt = DMA_ADDR_INCREMENT; | |
setup.src_cnt = DMA_ADDR_INCREMENT; | |
AIOCB aiocb; | |
aiocb.aio_sigevent.sigev_notify = SIGEV_THREAD; | |
aiocb.aio_sigevent.sigev_value.sival_int = ch; | |
aiocb.aio_sigevent.sigev_notify_function = &dma_callback; | |
setup.p_aio = &aiocb; | |
ret = R_DMA_Setup(ch, &setup, NULL); | |
if (ret == -1) | |
printf("R_DMA_Setup failed.\r\n"); | |
dma_trans_data_t data; | |
data.src_addr = srcBuff; | |
data.dst_addr = dstBuff; | |
data.count = sizeof(srcBuff); | |
t.reset(); | |
t.start(); | |
ret = R_DMA_Start(ch, &data, NULL); | |
if (ret == -1) | |
printf("R_DMA_Start failed.\r\n"); | |
wait(1.0); | |
printf("%f\r\n", t.read() * 1000.0); | |
#endif | |
while(1) { | |
#if 0 | |
led1 = 1; | |
wait(0.2); | |
led2 = 1; | |
wait(0.2); | |
led3 = 1; | |
wait(0.2); | |
led4 = 1; | |
wait(0.2); | |
led1 = 0; | |
wait(0.2); | |
led2 = 0; | |
wait(0.2); | |
led3 = 0; | |
wait(0.2); | |
led4 = 0; | |
wait(0.2); | |
#endif | |
#if 1 | |
size_t len = 64; | |
for (int i=0; i<16; ++i) { | |
t.reset(); | |
t.start(); | |
copyData(dstBuff, srcBuff, len); | |
t.stop(); | |
printf("%d\t%f\r\n", len, t.read() * 1000.0); | |
int sum = countSum(len); | |
if (sum != len) { | |
printf("invalid count. %d\r\n", sum); | |
} | |
len *= 2; | |
if (len > sizeof(dstBuff)) { | |
break; | |
} | |
} | |
printf("\r\n"); | |
#elif 0 | |
t.start(); | |
copyImage(); | |
t.stop(); | |
printf("%f\r\n", t.read() * 1000.0); | |
#else | |
int sum = countSum(sizeof(dstBuff)); | |
printf("sum : %d\r\n", sum); | |
wait(0.1); | |
#endif | |
} | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
https://github.com/d-kato/mbed-gr-libs/blob/cee9705b4e56aff62f3135e586b9817133635b73/R_BSP/RenesasBSP/drv_src/TARGET_RZ_A1XX/dma/dma.c#L667
新しい mbed-gr-libs のソースだと割込みの優先度を指定してる。でも Mbed OS のオーバーヘッドなんてたかがしれてるだろうしなぁ…。
ちなみに自分が mbed-gr-libs の `dma.c` ファイル中の `DMA_Start` 関数を改変して追加したコードは下記のようなもの。あとDMA転送完了のコールバック関数内で printf でタイマーの経過時間を出力しようとすると固まってしまう。
割込み側では書式 `%f` が使えないのかも。