Skip to content

Instantly share code, notes, and snippets.

@StrikerX3
Last active January 22, 2024 13:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save StrikerX3/739b1dd54aebf77d5021dec8939dced3 to your computer and use it in GitHub Desktop.
Save StrikerX3/739b1dd54aebf77d5021dec8939dced3 to your computer and use it in GitHub Desktop.
NDS ARM9 CPU cache research
// Adapted from https://github.com/blocksds/sdk/blob/master/tests/data_cache_ops/source/main.c
// Credits to AntonioND
//
// SPDX-License-Identifier: CC0-1.0
//
// SPDX-FileContributor: Antonio Niño Díaz, 2023
#include <stdio.h>
#include <string.h>
#include <fatfs.h>
#include <nds.h>
// Test for flush:
//
// - Set the test buffer to 0.
// - Read test buffer to load it into cache.
// - Copy from source buffer to test buffer. Writes will go to the cache because
// it has been loaded in the previous step. If not, it would have been written
// without loading it.
// - Execute DC flush. Regions that are flushed will be updated in RAM, regions
// that aren't flushed won't be updated in RAM.
// - Copy test buffer to destination buffer with DMA. DMA can't see the cache,
// so it copies the actual contents in RAM.
//
// Test for invalidate:
//
// - Set the test buffer to 0.
// - Read test buffer to load it into cache.
// - Copy from source buffer to test buffer. Writes will go to the cache because
// it has been loaded in the previous step. If not, it would have been written
// without loading it.
// - Execute DC invalidate. Regions that are invalidated will never be updated
// in RAM (unless they have been updated in RAM before the invalidate).
// - Copy test buffer to destination buffer without DMA. We want to see the
// parts of the buffer that are still in the cache and haven't been
// invalidated.
//
// The results screen should look like this:
//
// No operation:
// Ones: 256 0 0 | Note that the results here may vary a bit
// Twoes: 256 0 0 | because the cache is still active, we are
// | just not affecting it directly.
// Flush range:
// Ones: 128 128 0
// Twoes: 128 0 128
//
// Invalidate range:
// Ones: 128 128 0
// Twoes: 128 0 128
//
// Flush all:
// Ones: 0 256 0
// Twoes: 0 0 256
//
// Invalidate all:
// Ones: 256 0 0
// Twoes: 256 0 0
// Data cache geometry assumed by the tests in this file.
#define CACHE_LINE_SIZE 32
#define CACHE_ASSOCIATIVITY 4
#define DATA_CACHE_SIZE 4096
// Values derived from the geometry above.
#define DATA_CACHE_LINE_COUNT (DATA_CACHE_SIZE/CACHE_LINE_SIZE)
#define DATA_CACHE_SET_COUNT (DATA_CACHE_LINE_COUNT/CACHE_ASSOCIATIVITY)
#define DATA_CACHE_SET_STRIDE (CACHE_LINE_SIZE*CACHE_ASSOCIATIVITY)
#define PAGE_SIZE 4096
// The small buffers span 8 cache lines, the big buffers span 8 pages.
#define BUFFER_SIZE (CACHE_LINE_SIZE * 8)
#define BIG_BUFFER_SIZE (PAGE_SIZE * 8)

// Buffers used by the flush/invalidate tests.
ALIGN(CACHE_LINE_SIZE) volatile uint8_t buffer_source_1[BUFFER_SIZE];
ALIGN(CACHE_LINE_SIZE) volatile uint8_t buffer_source_2[BUFFER_SIZE];
ALIGN(CACHE_LINE_SIZE) volatile uint8_t buffer_test[BUFFER_SIZE];
ALIGN(CACHE_LINE_SIZE) volatile uint8_t buffer_destination[BUFFER_SIZE];

// buffer_big_test is used as the base address of protection region 7, whose
// size is sizeof(buffer_big_test). A protection region base has to be aligned
// to the region size, so page alignment alone is not enough here: align the
// buffer to its full size.
ALIGN(BIG_BUFFER_SIZE) volatile uint8_t buffer_big_test[BIG_BUFFER_SIZE];
ALIGN(PAGE_SIZE) volatile uint8_t buffer_big_destination[BIG_BUFFER_SIZE];

// Sink for volatile reads whose only purpose is to touch memory.
volatile uint8_t helper;
// Fills buffer_test and buffer_destination with zeroes and then calls
// DC_FlushAll().
void zero(void)
{
    void *const test = (void *)buffer_test;
    void *const destination = (void *)buffer_destination;

    memset(test, 0, sizeof buffer_test);
    memset(destination, 0, sizeof buffer_destination);
    DC_FlushAll();
}
// Reads one byte from every cache line of buffer_test (so the lines get
// loaded, as described in the test notes at the top of the file) and then
// copies buffer_source_1 over buffer_test with memcpy().
void copy_1(void)
{
    size_t offset = 0;
    while (offset < sizeof buffer_test)
    {
        helper = buffer_test[offset];
        offset += CACHE_LINE_SIZE;
    }

    memcpy((void *)buffer_test, (void *)buffer_source_1, sizeof buffer_test);
}
// Reads one byte from every cache line of buffer_test (so the lines get
// loaded, as described in the test notes at the top of the file) and then
// copies buffer_source_2 over buffer_test with memcpy().
void copy_2(void)
{
    size_t offset = 0;
    while (offset < sizeof buffer_test)
    {
        helper = buffer_test[offset];
        offset += CACHE_LINE_SIZE;
    }

    memcpy((void *)buffer_test, (void *)buffer_source_2, sizeof buffer_test);
}
// Copies buffer_test into buffer_destination using dmaCopy().
void copy_dma(void)
{
    void *const from = (void *)buffer_test;
    void *const to = (void *)buffer_destination;

    dmaCopy(from, to, sizeof buffer_test);
}
// Copies buffer_test into buffer_destination with a plain memcpy() (no DMA).
void copy_no_dma(void)
{
    void *const to = (void *)buffer_destination;
    void *const from = (void *)buffer_test;

    memcpy(to, from, sizeof buffer_test);
}
// Counts how many bytes of buffer_destination are 0x00, 0x11 and 0x22 and
// prints the three totals on one line, prefixed by `title`. Bytes holding any
// other value are not counted.
void count(const char *title)
{
    // tally[0] = zeroes, tally[1] = ones (0x11), tally[2] = twoes (0x22)
    int tally[3] = { 0, 0, 0 };

    for (size_t pos = 0; pos < sizeof buffer_destination; ++pos)
    {
        switch (buffer_destination[pos])
        {
        case 0x00:
            ++tally[0];
            break;
        case 0x11:
            ++tally[1];
            break;
        case 0x22:
            ++tally[2];
            break;
        default:
            break;
        }
    }

    printf("%s %4d %4d %4d\n", title, tally[0], tally[1], tally[2]);
}
// Calls DC_FlushRange() on the first half of buffer_test.
void flush_half(void)
{
    const size_t half = sizeof buffer_test / 2;

    DC_FlushRange((void *)buffer_test, half);
}
// Calls DC_InvalidateRange() on the first half of buffer_test.
void invalidate_half(void)
{
    const size_t half = sizeof buffer_test / 2;

    DC_InvalidateRange((void *)buffer_test, half);
}
// Runs the five flush/invalidate experiments described at the top of the file.
//
// For each experiment the 0x11 pattern and then the 0x22 pattern are pushed
// through buffer_test, and count() prints how many bytes of
// buffer_destination ended up as 0x00 / 0x11 / 0x22. The value returned by
// cpuEndTiming() for the whole experiment is printed on its own line
// (previously it had no trailing newline, so the next section header was
// glued onto the timing value). Waits for START before returning.
void test_flush_invalidate(void)
{
    consoleDemoInit();

    memset((void *)buffer_source_1, 0x11, sizeof(buffer_source_1));
    memset((void *)buffer_source_2, 0x22, sizeof(buffer_source_2));

    uint32_t ticks;

    // No explicit cache maintenance between the copy and the DMA read-back.
    printf("No operation:\n");
    DC_FlushAll();
    cpuStartTiming(0);
    {
        zero();
        copy_1();
        copy_dma();
        count(" Ones: ");

        zero();
        copy_2();
        copy_dma();
        count(" Twoes: ");
    }
    ticks = cpuEndTiming();
    printf(" t=%lu\n", ticks);

    // Flush only the first half of the test buffer, then read back with DMA.
    printf("Flush range:\n");
    DC_FlushAll();
    cpuStartTiming(0);
    {
        zero();
        copy_1();
        flush_half();
        copy_dma();
        count(" Ones: ");

        zero();
        copy_2();
        flush_half();
        copy_dma();
        count(" Twoes: ");
    }
    ticks = cpuEndTiming();
    printf(" t=%lu\n", ticks);

    // Invalidate only the first half, then read back through the CPU.
    printf("Invalidate range:\n");
    DC_FlushAll();
    cpuStartTiming(0);
    {
        zero();
        copy_1();
        invalidate_half();
        copy_no_dma();
        count(" Ones: ");

        zero();
        copy_2();
        invalidate_half();
        copy_no_dma();
        count(" Twoes: ");
    }
    ticks = cpuEndTiming();
    printf(" t=%lu\n", ticks);

    // Flush the whole data cache, then read back with DMA.
    printf("Flush all:\n");
    DC_FlushAll();
    cpuStartTiming(0);
    {
        zero();
        copy_1();
        DC_FlushAll();
        copy_dma();
        count(" Ones: ");

        zero();
        copy_2();
        DC_FlushAll();
        copy_dma();
        count(" Twoes: ");
    }
    ticks = cpuEndTiming();
    printf(" t=%lu\n", ticks);

    // Invalidate the whole data cache, then read back through the CPU.
    printf("Invalidate all:\n");
    DC_FlushAll();
    cpuStartTiming(0);
    {
        zero();
        copy_1();
        DC_InvalidateAll();
        copy_no_dma();
        count(" Ones: ");

        zero();
        copy_2();
        DC_InvalidateAll();
        copy_no_dma();
        count(" Twoes: ");
    }
    ticks = cpuEndTiming();
    printf(" t=%lu\n", ticks);

    printf("Press START to exit\n");
    fflush(stdout);

    while (1)
    {
        swiWaitForVBlank();
        scanKeys();
        if (keysHeld() & KEY_START)
            break;
    }
}
// ------------------------------------------------------------------
// Writes `value` to the CP15 register addressed as "p15, 3, c15, c0, 0".
// The read_*/write_* TAG and cache RAM helpers below call this first with the
// packed index/segment/word selector before accessing c15, c1..c4.
void set_debug_reg(uint32_t value)
{
asm volatile("mcr p15, 3, %0, c15, c0, 0" : : "r"(value));
}
// Returns the code-cache TAG entry for the given index (0..63) and segment
// (0..3). The selector is written via set_debug_reg() with the index in bits
// 5..10 and the segment in bits 30..31, then "p15, 3, c15, c1, 0" is read.
uint32_t read_code_tag_ram(uint32_t index, uint32_t segment)
{
    const uint32_t index_bits = (index & 63) << 5;
    const uint32_t segment_bits = (segment & 3) << 30;
    set_debug_reg(index_bits | segment_bits);

    uint32_t tag;
    asm volatile("mrc p15, 3, %0, c15, c1, 0" : "=r"(tag));
    return tag;
}
// Returns the data-cache TAG entry for the given index (0..31) and segment
// (0..3). The selector is written via set_debug_reg() with the index in bits
// 5..9 and the segment in bits 30..31, then "p15, 3, c15, c2, 0" is read.
uint32_t read_data_tag_ram(uint32_t index, uint32_t segment)
{
    const uint32_t index_bits = (index & 31) << 5;
    const uint32_t segment_bits = (segment & 3) << 30;
    set_debug_reg(index_bits | segment_bits);

    uint32_t tag;
    asm volatile("mrc p15, 3, %0, c15, c2, 0" : "=r"(tag));
    return tag;
}
// Returns one 32-bit word of code-cache RAM. The selector written via
// set_debug_reg() packs the word (0..7) in bits 2..4, the index (0..63) in
// bits 5..10 and the segment (0..3) in bits 30..31; the word is then read
// from "p15, 3, c15, c3, 0".
uint32_t read_code_cache_ram(uint32_t index, uint32_t segment, uint32_t word)
{
    const uint32_t word_bits = (word & 7) << 2;
    const uint32_t index_bits = (index & 63) << 5;
    const uint32_t segment_bits = (segment & 3) << 30;
    set_debug_reg(index_bits | segment_bits | word_bits);

    uint32_t contents;
    asm volatile("mrc p15, 3, %0, c15, c3, 0" : "=r"(contents));
    return contents;
}
// Returns one 32-bit word of data-cache RAM. The selector written via
// set_debug_reg() packs the word (0..7) in bits 2..4, the index (0..31) in
// bits 5..9 and the segment (0..3) in bits 30..31; the word is then read
// from "p15, 3, c15, c4, 0".
uint32_t read_data_cache_ram(uint32_t index, uint32_t segment, uint32_t word)
{
    const uint32_t word_bits = (word & 7) << 2;
    const uint32_t index_bits = (index & 31) << 5;
    const uint32_t segment_bits = (segment & 3) << 30;
    set_debug_reg(index_bits | segment_bits | word_bits);

    uint32_t contents;
    asm volatile("mrc p15, 3, %0, c15, c4, 0" : "=r"(contents));
    return contents;
}
// Writes `value` to the code-cache TAG entry selected by index (0..63) and
// segment (0..3): the selector goes through set_debug_reg(), the value is
// written to "p15, 3, c15, c1, 0".
void write_code_tag_ram(uint32_t index, uint32_t segment, uint32_t value)
{
    const uint32_t selector = ((segment & 3) << 30) | ((index & 63) << 5);
    set_debug_reg(selector);

    asm volatile("mcr p15, 3, %0, c15, c1, 0" : : "r"(value));
}
// Writes `value` to the data-cache TAG entry selected by index (0..31) and
// segment (0..3): the selector goes through set_debug_reg(), the value is
// written to "p15, 3, c15, c2, 0".
void write_data_tag_ram(uint32_t index, uint32_t segment, uint32_t value)
{
    const uint32_t selector = ((segment & 3) << 30) | ((index & 31) << 5);
    set_debug_reg(selector);

    asm volatile("mcr p15, 3, %0, c15, c2, 0" : : "r"(value));
}
// Writes `value` to one word of code-cache RAM selected by index (0..63),
// segment (0..3) and word (0..7): the selector goes through set_debug_reg(),
// the value is written to "p15, 3, c15, c3, 0".
void write_code_cache_ram(uint32_t index, uint32_t segment, uint32_t word, uint32_t value)
{
    const uint32_t selector =
        ((segment & 3) << 30) | ((index & 63) << 5) | ((word & 7) << 2);
    set_debug_reg(selector);

    asm volatile("mcr p15, 3, %0, c15, c3, 0" : : "r"(value));
}
// Writes `value` to one word of data-cache RAM selected by index (0..31),
// segment (0..3) and word (0..7): the selector goes through set_debug_reg(),
// the value is written to "p15, 3, c15, c4, 0".
void write_data_cache_ram(uint32_t index, uint32_t segment, uint32_t word, uint32_t value)
{
    const uint32_t selector =
        ((segment & 3) << 30) | ((index & 31) << 5) | ((word & 7) << 2);
    set_debug_reg(selector);

    asm volatile("mcr p15, 3, %0, c15, c4, 0" : : "r"(value));
}
// ------------------------------------------------------------------
// Moves the cursor to the top-left corner and prints the raw code-cache TAG
// values of 16 consecutive indices starting at `offset`, 4 segments each,
// as 8 hex digits per entry. The escape code alternates between "37;0" and
// "37;1" depending on the low bit of the segment.
void print_raw_code_tag_ram(uint32_t offset)
{
    printf("\x1b[0;0H");

    // cell / 4 is the index within the window, cell % 4 is the segment.
    for (uint32_t cell = 0; cell < 16 * 4; cell++)
    {
        const uint32_t index = cell / 4;
        const uint32_t segment = cell % 4;
        const uint32_t entry = read_code_tag_ram(index + offset, segment);

        printf("\x1b[37;%lum%08lX", segment & 1, entry);
    }
}
// Moves the cursor to the top-left corner and prints the raw data-cache TAG
// values of 16 consecutive indices starting at `offset`, 4 segments each,
// as 8 hex digits per entry. The escape code alternates between "37;0" and
// "37;1" depending on the low bit of the segment.
void print_raw_data_tag_ram(uint32_t offset)
{
    printf("\x1b[0;0H");

    // cell / 4 is the index within the window, cell % 4 is the segment.
    for (uint32_t cell = 0; cell < 16 * 4; cell++)
    {
        const uint32_t index = cell / 4;
        const uint32_t segment = cell % 4;
        const uint32_t entry = read_data_tag_ram(index + offset, segment);

        printf("\x1b[37;%lum%08lX", segment & 1, entry);
    }
}
void print_raw_code_cache_ram(uint32_t offset)
{
printf("\x1b[0;0H");
for (uint32_t index = 0; index < 16; index++)
{
for (uint32_t segment = 0; segment < 4; segment++)
{
for (uint32_t word = 0; word < 7; word++)
{
uint32_t value = read_code_cache_ram(index + offset, segment, word);
printf("\x1b[37;%lum%08lX", segment & 1, value);
}
}
}
}
void print_raw_data_cache_ram(uint32_t offset)
{
printf("\x1b[0;0H");
for (uint32_t index = 0; index < 16; index++)
{
for (uint32_t segment = 0; segment < 4; segment++)
{
for (uint32_t word = 0; word < 7; word++)
{
uint32_t value = read_data_cache_ram(index + offset, segment, word);
printf("\x1b[37;%lum%08lX", segment & 1, value);
}
}
}
}
// Interactive viewer for the raw TAG / cache RAM contents read through the
// CP15 debug registers. Every frame it prints a window of 16 indices starting
// at `offset` and then handles input:
//   Up/Down - move the window by one index
//   X       - toggle between code and data cache
//   Y       - toggle between TAG and cache RAM
//   START   - exit
//
// The window is clamped so it never runs past the last index: the code cache
// helpers address 64 indices (index & 63) and the data cache helpers 32
// (index & 31). The code-cache limit used to be 32, which made indices 48..63
// unreachable.
void test_raw_debug_regs()
{
    consoleDemoInit();

    bool code = false;
    bool tag = false;
    uint32_t offset = 0;

    while (1)
    {
        swiWaitForVBlank();

        if (code)
        {
            if (offset > 64 - 16)
                offset = 64 - 16;
            if (tag)
                print_raw_code_tag_ram(offset);
            else
                print_raw_code_cache_ram(offset);
        }
        else
        {
            if (offset > 32 - 16)
                offset = 32 - 16;
            if (tag)
                print_raw_data_tag_ram(offset);
            else
                print_raw_data_cache_ram(offset);
        }

        printf("\n");
        printf("[Up/Down] Range: %lu..%lu \n", offset, offset + 15);
        printf("[X] View %s cache\n", code ? "<Code>/Data" : "Code/<Data>");
        printf("[Y] View %s RAM\n", tag ? "<TAG>/Cache" : "TAG/<Cache>");
        printf("[START] Exit");

        scanKeys();
        if (keysHeld() & KEY_UP)
            if (offset > 0)
                --offset;
        // May overshoot the limit; it is clamped at the top of the next frame.
        if (keysHeld() & KEY_DOWN)
            ++offset;
        if (keysDown() & KEY_X)
            code = !code;
        if (keysDown() & KEY_Y)
            tag = !tag;
        if (keysHeld() & KEY_START)
            break;
    }
}
// ------------------------------------------------------------------
// Prints a decoded 16-row view of the code cache starting at row `offset`.
// A row number packs index (row >> 4), segment ((row >> 2) & 3) and word
// (row & 3). Each row shows the index, segment and word, the TAG bit 4 as
// 'V', TAG bits 2 and 3 as 'D' flags, the TAG with its low 11 bits masked
// off, and the cache RAM word.
void print_code_cache_debug(uint32_t offset)
{
    printf("\x1b[0;0H\x1b[37;3m");
    printf("St I W V DD TAG Data\n");

    for (uint32_t row = 0; row < 16; row++)
    {
        const uint32_t packed = row + offset;
        const uint32_t index = packed >> 4;
        const uint32_t segment = (packed >> 2) & 3;
        const uint32_t word = packed & 3;

        const uint32_t tag = read_code_tag_ram(index, segment);
        const uint32_t data = read_code_cache_ram(index, segment, word);

        const char validMark = ((tag >> 4) & 1) ? 'V' : '.';
        const char dirtyMark1 = ((tag >> 2) & 1) ? 'D' : '.';
        const char dirtyMark2 = ((tag >> 3) & 1) ? 'D' : '.';
        const uint32_t tagAddr = tag & 0xFFFFF800;

        printf("\x1b[37;%lum", segment & 1);
        printf("%02lu %lu %lu %c %c%c %08lX = %08lX\n",
               index, segment, word,
               validMark, dirtyMark1, dirtyMark2,
               tagAddr, data);
    }

    printf("\x1b[37;3m");
}
// Prints a decoded 16-row view of the data cache starting at row `offset`.
// A row number packs index (row >> 4), segment ((row >> 2) & 3) and word
// (row & 3). Each row shows the index, segment and word, the TAG bit 4 as
// 'V', TAG bits 2 and 3 as 'D' flags, the TAG with its low 10 bits masked
// off, and the cache RAM word.
void print_data_cache_debug(uint32_t offset)
{
    printf("\x1b[0;0H\x1b[37;3m");
    printf("St I W V DD TAG Data\n");

    for (uint32_t row = 0; row < 16; row++)
    {
        const uint32_t packed = row + offset;
        const uint32_t index = packed >> 4;
        const uint32_t segment = (packed >> 2) & 3;
        const uint32_t word = packed & 3;

        const uint32_t tag = read_data_tag_ram(index, segment);
        const uint32_t data = read_data_cache_ram(index, segment, word);

        const char validMark = ((tag >> 4) & 1) ? 'V' : '.';
        const char dirtyMark1 = ((tag >> 2) & 1) ? 'D' : '.';
        const char dirtyMark2 = ((tag >> 3) & 1) ? 'D' : '.';
        const uint32_t tagAddr = tag & 0xFFFFFC00;

        printf("\x1b[37;%lum", segment & 1);
        printf("%02lu %lu %lu %c %c%c %08lX = %08lX\n",
               index, segment, word,
               validMark, dirtyMark1, dirtyMark2,
               tagAddr, data);
    }

    printf("\x1b[37;3m");
}
// Interactive viewer for the decoded cache contents. Every frame it clamps
// `offset` to the last full 16-row window of the selected cache, prints that
// window and then handles input:
//   Up/Down    - move by 1 row
//   Left/Right - move by 4 rows
//   L/R        - move by 16 rows
//   X          - toggle between code and data cache
//   START      - exit
void test_debug_regs()
{
    consoleDemoInit();

    bool code = false;
    uint32_t offset = 0;
    keysSetRepeat(0, 0);

    for (;;)
    {
        swiWaitForVBlank();

        // Rows are index*16 + segment*4 + word; keep the 16-row window in range.
        const uint32_t lastWindow = code ? (64*4*4-16) : (32*4*4-16);
        if (offset > lastWindow)
            offset = lastWindow;

        if (code)
            print_code_cache_debug(offset);
        else
            print_data_cache_debug(offset);

        printf("\n");
        printf("[Up/Down] Up/down by 1\n");
        printf("[Left/Right] Up/down by 4\n");
        printf("[L/R] Up/down by 16\n");
        printf("[X] View %s cache\n", code ? "<Code>/Data" : "Code/<Data>");
        //printf("[A/B] Test code/data TAG write\n");
        printf("[START] Exit");

        scanKeys();
        const uint32_t held = keysHeld();
        const uint32_t pressed = keysDown();

        if ((held & KEY_UP) && offset > 0)
            --offset;
        if (held & KEY_DOWN)
            ++offset;
        if (held & KEY_LEFT)
            offset = (offset >= 4) ? offset - 4 : 0;
        if (held & KEY_RIGHT)
            offset += 4;
        if (held & KEY_L)
            offset = (offset >= 16) ? offset - 16 : 0;
        if (held & KEY_R)
            offset += 16;
        if (pressed & KEY_X)
            code = !code;

        /*if (keysDown() & KEY_A)
        {
            uint32_t tag = read_code_tag_ram(2, 0);
            tag |= 0b11 << 2; // set dirty bits
            tag &= ~(1 << 4); // clear valid bit
            tag &= ~(3 | (63 << 5)); // clear set and index
            tag |= 3 | (1 << 5); // modify set and index for the test
            write_code_tag_ram(2, 0, tag); // will it use set/index 2,0 or 1,3?
        }
        if (keysDown() & KEY_B)
        {
            uint32_t tag = read_data_tag_ram(2, 0);
            tag |= 0b11 << 2; // set dirty bits
            tag &= ~(1 << 4); // clear valid bit
            tag &= ~(3 | (31 << 5)); // clear set and index
            tag |= 3 | (1 << 5); // modify set and index for the test
            write_data_tag_ram(2, 0, tag); // will it use set/index 2,0 or 1,3?
        }*/

        if (held & KEY_START)
            break;
    }
}
// ------------------------------------------------------------------
// Writes a sequence of values to the CP15 control register (c1, c0, 0) and
// reads the register back after each write, printing what was actually
// stored. The sequence, per the comments below, is: PU + caches + write
// buffer disabled, caches enabled, write buffer enabled, PU enabled.
// Waits for START before returning.
void test_pu_cache_bits()
{
consoleDemoInit();
uint32_t ctl;
// Initial value of the control register.
asm volatile("mrc p15, 0, %0, c1, c0, 0" : "=r"(ctl));
printf("CP15 ctl = %08lX\n", ctl); // 0005307D
// disable PU, both caches and write buffer
// (hard-coded value; the commented-out line shows the bits involved:
// bit 0, bit 2, bit 3 and bit 12)
//ctl &= ~((1 << 0) | (1 << 2) | (1 << 3) | (1 << 12));
ctl = 0x52070;
asm volatile("mcr p15, 0, %0, c1, c0, 0" : : "r"(ctl));
printf("\x1b[1;0HDisabled PU, caches, WB\n");
asm volatile("mrc p15, 0, %0, c1, c0, 0" : "=r"(ctl));
printf("\x1b[2;0HCP15 ctl = %08lX\n", ctl);
// try enabling both caches without PU
//ctl |= (1 << 2) | (1 << 12);
ctl = 0x53074;
asm volatile("mcr p15, 0, %0, c1, c0, 0" : : "r"(ctl));
printf("\x1b[3;0HEnabled caches\n");
asm volatile("mrc p15, 0, %0, c1, c0, 0" : "=r"(ctl));
printf("\x1b[4;0HCP15 ctl = %08lX\n", ctl);
// try enabling write buffer without PU
//ctl |= (1 << 3);
ctl = 0x5307C;
asm volatile("mcr p15, 0, %0, c1, c0, 0" : : "r"(ctl));
printf("\x1b[5;0HEnabled write buffer\n");
asm volatile("mrc p15, 0, %0, c1, c0, 0" : "=r"(ctl));
printf("\x1b[6;0HCP15 ctl = %08lX\n", ctl);
// enable PU
// (this restores the value noted next to the first printf above)
//ctl |= (1 << 0);
ctl = 0x5307D;
asm volatile("mcr p15, 0, %0, c1, c0, 0" : : "r"(ctl));
printf("\x1b[7;0HEnabled PU\n");
asm volatile("mrc p15, 0, %0, c1, c0, 0" : "=r"(ctl));
printf("\x1b[8;0HCP15 ctl = %08lX\n", ctl);
printf("\x1b[10;0HPress START to exit\n");
fflush(stdout);
// Wait for START.
while (1)
{
swiWaitForVBlank();
scanKeys();
if (keysHeld() & KEY_START)
break;
}
}
// ------------------------------------------------------------------
// Experiment: what do reads of `helper` and buffer_test[0] return when the
// values were written while the data cachability register (c2, c0, 0) was
// non-zero and the register is then set to 0? The two bytes are printed after
// every step so the results can be compared on screen.
// NOTE(review): the register is left at 0 when the function returns.
// Waits for START before returning.
void test_dirty_cache_read()
{
consoleDemoInit();
uint32_t cachability;
// Print the current register value, then reuse the variable for the value
// that is written back further down (0).
asm volatile("mrc p15, 0, %0, c2, c0, 0" : "=r"(cachability));
printf("\x1b[0;0HCachability = %08lX\n", cachability);
cachability = 0;
printf("\x1b[1;0HBefore init: %02X %02X\n", helper, buffer_test[0]);
// Step 1: write known values and flush.
helper = 0xAA;
buffer_test[0] = 0xBB;
DC_FlushAll();
printf("\x1b[2;0HAfter flush: %02X %02X\n", helper, buffer_test[0]);
// Step 2: write new values without flushing.
buffer_test[0] = helper;
helper = 0xCC;
printf("\x1b[3;0HAfter writes: %02X %02X\n", helper, buffer_test[0]);
// Step 3: write 0 to the cachability register and read the values again.
asm volatile("mcr p15, 0, %0, c2, c0, 0" : : "r"(cachability));
printf("\x1b[4;0HCachable off: %02X %02X\n", helper, buffer_test[0]);
// Step 4: flush again and read once more.
DC_FlushAll();
printf("\x1b[5;0HAfter flush: %02X %02X\n", helper, buffer_test[0]);
printf("\x1b[7;0HPress START to exit\n");
fflush(stdout);
// Wait for START.
while (1)
{
swiWaitForVBlank();
scanKeys();
if (keysHeld() & KEY_START)
break;
}
}
// ------------------------------------------------------------------
// Prints the eight protection unit regions (CP15 c6, c0..c7). For each
// enabled region (bit 0 set) it prints the address range and the flags
// " cc" / " dc" / " wb" when the region's bit is set in the code cachability
// (c2, c0, 1), data cachability (c2, c0, 0) and bufferability (c3, c0, 0)
// registers respectively. Disabled regions are printed as "(disabled)".
void print_pu_regions()
{
    uint32_t pu[8];
    uint32_t dataCachability;
    uint32_t codeCachability;
    uint32_t bufferability;

    // The coprocessor register number is part of the instruction encoding,
    // so these reads cannot be turned into a loop.
    asm volatile("mrc p15, 0, %0, c6, c0, 0" : "=r"(pu[0]));
    asm volatile("mrc p15, 0, %0, c6, c1, 0" : "=r"(pu[1]));
    asm volatile("mrc p15, 0, %0, c6, c2, 0" : "=r"(pu[2]));
    asm volatile("mrc p15, 0, %0, c6, c3, 0" : "=r"(pu[3]));
    asm volatile("mrc p15, 0, %0, c6, c4, 0" : "=r"(pu[4]));
    asm volatile("mrc p15, 0, %0, c6, c5, 0" : "=r"(pu[5]));
    asm volatile("mrc p15, 0, %0, c6, c6, 0" : "=r"(pu[6]));
    asm volatile("mrc p15, 0, %0, c6, c7, 0" : "=r"(pu[7]));
    asm volatile("mrc p15, 0, %0, c2, c0, 0" : "=r"(dataCachability));
    asm volatile("mrc p15, 0, %0, c2, c0, 1" : "=r"(codeCachability));
    asm volatile("mrc p15, 0, %0, c3, c0, 0" : "=r"(bufferability));

    // Base address: bits 12..31 of the region register.
    auto puBaseAddr = [](uint32_t pu) -> uint32_t {
        return pu & 0xFFFFF000;
    };
    // Size in bytes: 2 << N, where N is the 5-bit field in bits 1..5.
    // The shift is done on an unsigned 32-bit value: with a plain int,
    // N = 31 overflows (undefined behaviour). Unsigned, it wraps to 0 and
    // "base + size - 1" below still yields the correct last address.
    auto puSize = [](uint32_t pu) -> uint32_t {
        return static_cast<uint32_t>(2) << ((pu >> 1) & 0x1F);
    };

    for (uint32_t i = 0; i < 8; i++) {
        bool enable = pu[i] & 1;
        if (enable) {
            uint32_t baseAddr = puBaseAddr(pu[i]);
            uint32_t size = puSize(pu[i]);
            bool codeCachable = (codeCachability >> i) & 1;
            bool dataCachable = (dataCachability >> i) & 1;
            bool bufferable = (bufferability >> i) & 1;
            printf("%lu: %08lX..%08lX%s%s%s\n", i, baseAddr, baseAddr + size - 1,
                codeCachable ? " cc" : "",
                dataCachable ? " dc" : "",
                bufferable ? " wb" : ""
            );
        } else {
            printf("%lu: (disabled)\n", i);
        }
    }
}
// Reprograms the protection unit so that region 7 covers buffer_big_test:
//  - the contents of regions 4..7 are moved down to 3..6,
//  - data cachability and bufferability are cleared for every region,
//  - the code cachability bits of regions 4..7 are moved down with them and
//    bit 7 is set for the new region 7,
//  - region 7 is set to base = buffer_big_test, size = sizeof(buffer_big_test).
void shift_pu_regions()
{
uint32_t pu[8];
uint32_t dataCachability;
uint32_t codeCachability;
uint32_t bufferability;
// Only regions 3..7 are read; pu[0..2] are never accessed. pu[3] is read but
// its old value is discarded (it gets overwritten with region 4 below).
asm volatile("mrc p15, 0, %0, c6, c3, 0" : "=r"(pu[3]));
asm volatile("mrc p15, 0, %0, c6, c4, 0" : "=r"(pu[4]));
asm volatile("mrc p15, 0, %0, c6, c5, 0" : "=r"(pu[5]));
asm volatile("mrc p15, 0, %0, c6, c6, 0" : "=r"(pu[6]));
asm volatile("mrc p15, 0, %0, c6, c7, 0" : "=r"(pu[7]));
asm volatile("mrc p15, 0, %0, c2, c0, 0" : "=r"(dataCachability));
asm volatile("mrc p15, 0, %0, c2, c0, 1" : "=r"(codeCachability));
asm volatile("mrc p15, 0, %0, c3, c0, 0" : "=r"(bufferability));
// Move PU regions 4..7 down to 3..6 to make room for a high-priority PU region we can manipulate at will.
// On BlocksDS, region 3 is reserved for the GBA cart on regular DS or the switchable IWRAM on DSi,
// neither of which we're using here.
// Also make everything non-data-cachable and nonbufferable and the new region 7 code-cachable.
// adjust(): keeps the per-region bits 0..2 and shifts bits 4..7 down by one
// (old bit 3 is dropped).
auto adjust = [](uint32_t bits) {
uint32_t fixed = bits & 0b111;
uint32_t shifted = (bits & 0b11110000) >> 1;
return fixed | shifted;
};
dataCachability = 0;
codeCachability = adjust(codeCachability);
bufferability = 0;
codeCachability |= (1 << 7);
// Flush before the attribute registers change.
DC_FlushAll();
asm volatile("mcr p15, 0, %0, c2, c0, 0" : : "r"(dataCachability));
asm volatile("mcr p15, 0, %0, c2, c0, 1" : : "r"(codeCachability));
asm volatile("mcr p15, 0, %0, c3, c0, 0" : : "r"(bufferability));
asm volatile("mcr p15, 0, %0, c6, c3, 0" : : "r"(pu[4]));
asm volatile("mcr p15, 0, %0, c6, c4, 0" : : "r"(pu[5]));
asm volatile("mcr p15, 0, %0, c6, c5, 0" : : "r"(pu[6]));
asm volatile("mcr p15, 0, %0, c6, c6, 0" : : "r"(pu[7]));
// Make PU region 7 contain the page buffer
// NOTE(review): these asserts only check 4 KB alignment of the base; PU
// regions presumably need a base aligned to the region size (here
// sizeof(buffer_big_test)) - confirm against the CPU documentation.
assert(((uintptr_t)buffer_big_test & 0xFFF) == 0);
assert((sizeof(buffer_big_test) & 0xFFF) == 0);
assert(__builtin_popcount(sizeof(buffer_big_test)) == 1);
uintptr_t base = (uintptr_t)buffer_big_test;
uintptr_t size = sizeof(buffer_big_test);
// Size field N encodes a region of 2 << N bytes, so N = log2(size) - 1.
uint32_t sizeVal = __builtin_ctz(size) - 1;
// Region register layout: base address | size field << 1 | enable bit.
pu[7] = base | (sizeVal << 1) | 1;
DC_FlushAll();
asm volatile("mcr p15, 0, %0, c6, c7, 0" : : "r"(pu[7]));
// The attribute registers are written a second time with the same values.
asm volatile("mcr p15, 0, %0, c2, c0, 0" : : "r"(dataCachability));
asm volatile("mcr p15, 0, %0, c2, c0, 1" : : "r"(codeCachability));
asm volatile("mcr p15, 0, %0, c3, c0, 0" : : "r"(bufferability));
}
// Sets (cachable == true) or clears (cachable == false) bit 7 of the data
// cachability register (CP15 c2, c0, 0), leaving the other bits untouched.
void set_pu7_data_cachable(bool cachable)
{
    const uint32_t region7 = (uint32_t)1 << 7;

    uint32_t bits;
    asm volatile("mrc p15, 0, %0, c2, c0, 0" : "=r"(bits));
    bits = cachable ? (bits | region7) : (bits & ~region7);
    asm volatile("mcr p15, 0, %0, c2, c0, 0" : : "r"(bits));
}
// Sets (bufferable == true) or clears (bufferable == false) bit 7 of the
// bufferability register (CP15 c3, c0, 0), leaving the other bits untouched.
void set_pu7_bufferable(bool bufferable)
{
    const uint32_t region7 = (uint32_t)1 << 7;

    uint32_t bits;
    asm volatile("mrc p15, 0, %0, c3, c0, 0" : "=r"(bits));
    bits = bufferable ? (bits | region7) : (bits & ~region7);
    asm volatile("mcr p15, 0, %0, c3, c0, 0" : : "r"(bits));
}
// Sets (roundRobin == true) or clears (roundRobin == false) bit 14 of the
// CP15 control register (c1, c0, 0), leaving the other bits untouched.
void set_replacement_strategy(bool roundRobin)
{
    const uint32_t roundRobinBit = (uint32_t)1 << 14;

    uint32_t control;
    asm volatile("mrc p15, 0, %0, c1, c0, 0" : "=r"(control));
    control = roundRobin ? (control | roundRobinBit) : (control & ~roundRobinBit);
    asm volatile("mcr p15, 0, %0, c1, c0, 0" : : "r"(control));
}
// Records which cache way gets filled when buffer_big_test[0] is read over
// and over, to study the cache replacement sequence.
//
// Each iteration: sets bit 14 of the control register only when
// (i & 0x7FF) == 0, reads buffer_big_test[0], inspects the data TAG RAM of
// the four segments of that address' set, and ORs "valid * segment" as a
// 2-bit value into VRAM at bit position i * 2. Then the whole data cache is
// invalidated. With `total` iterations at 2 bits each this fills exactly the
// 512 KB of VRAM that were cleared beforehand.
// NOTE(review): since the cache is invalidated every iteration, presumably
// only one segment is valid at a time, making the 2-bit value the way that
// was picked - confirm on hardware.
//
// Runs inside enterCriticalSection()/leaveCriticalSection().
ITCM_CODE
void run_replacement_loop()
{
uintptr_t addr = (uintptr_t)buffer_big_test;
// Set index of the buffer's first line: address bits 5..9.
uint32_t set = (addr >> 5) & 31;
int oldIME = enterCriticalSection();
set_replacement_strategy(false);
DC_FlushAll();
// Disable and reenable data cache
/*uint32_t ctl;
asm volatile("mrc p15, 0, %0, c1, c0, 0" : "=r"(ctl));
ctl &= ~(1 << 2);
asm volatile("mcr p15, 0, %0, c1, c0, 0" : : "r"(ctl));
ctl |= (1 << 2);
asm volatile("mcr p15, 0, %0, c1, c0, 0" : : "r"(ctl));*/
// Get CP15 control register to manipulate during the loop
uint32_t ctl;
asm volatile("mrc p15, 0, %0, c1, c0, 0" : "=r"(ctl));
// Set up 512 KB of VRAM for ARM9 access and clear VRAM
uint32_t *vramData = (uint32_t*)0x6800000;
videoSetMode(MODE_FB0);
vramSetBankA(VRAM_A_LCD);
vramSetBankB(VRAM_B_LCD);
vramSetBankC(VRAM_C_LCD);
vramSetBankD(VRAM_D_LCD);
// Clearing is required because results are ORed into VRAM below.
for (uint32_t i = 0; i < 512*1024 / sizeof(uint32_t); i++) {
vramData[i] = 0;
}
// Abuse data cache lockdown to reset the counter (doesn't work with random replacement strategy)
/*asm volatile("mcr p15, 0, %0, c9, c0, 0" : : "r"(0 | (1 << 31)));
helper = buffer_big_test[1 * DATA_CACHE_SIZE];
asm volatile("mcr p15, 0, %0, c9, c0, 0" : : "r"(1));
asm volatile("mcr p15, 0, %0, c9, c0, 0" : : "r"(0));
DC_InvalidateAll();*/
//asm volatile("mcr p15, 0, %0, c9, c0, 0" : : "r"(0));
// 512 KB * 8 bits / 2 bits per sample = number of samples that fit in VRAM.
constexpr uint32_t total = 512*1024 * 8/2;
//constexpr uint32_t total = 1*1024 * 8/2;
for (uint32_t i = 0; i < total; i++)
{
// Lockdown cache lines
/*constexpr uint32_t linesToLock = 1;
for (uint32_t line = 0; line < linesToLock; line++)
{
asm volatile("mcr p15, 0, %0, c9, c0, 0" : : "r"(line | (1 << 31)));
helper = buffer_big_test[1 * DATA_CACHE_SIZE];
}
asm volatile("mcr p15, 0, %0, c9, c0, 0" : : "r"(linesToLock));*/
// Mess with the data cache lockdown register
//asm volatile("mcr p15, 0, %0, c9, c0, 0" : : "r"((i >> 7) & 0b11));
// Flip between round-robin and random every so often
/*ctl &= ~(1 << 14);
ctl |= ((i >> 5) & 1) << 14;
asm volatile("mcr p15, 0, %0, c1, c0, 0" : : "r"(ctl));*/
// Switch to round-robin for one iteration after 0x7FF random entries
ctl &= ~(1 << 14);
ctl |= ((i & 0x7FF) == 0 ? 1 : 0) << 14;
asm volatile("mcr p15, 0, %0, c1, c0, 0" : : "r"(ctl));
// The read whose cache line placement is being observed.
helper = buffer_big_test[0];
// Inlined equivalent of read_data_tag_ram(set, j) for each segment j.
for (int j = 0; j < 4; j++) {
uint32_t reg = ((set & 31) << 5) | ((j & 3) << 30);
asm volatile("mcr p15, 3, %0, c15, c0, 0" : : "r"(reg));
uint32_t tag;
asm volatile("mrc p15, 3, %0, c15, c2, 0" : "=r"(tag));
uint32_t valid = (tag >> 4) & 1;
// Contributes j when the segment is valid, 0 otherwise.
uint32_t value = valid * j;
// Sample i occupies bits [2*i, 2*i + 1] of the VRAM bit stream.
uint32_t pos = i * 2;
uint32_t wordIndex = pos / 32;
uint32_t shift = pos & 31;
vramData[wordIndex] |= value << shift;
}
// Empty the cache so the next read has to allocate a line again.
DC_InvalidateAll();
}
leaveCriticalSection(oldIME);
}
// Runs the cache replacement experiment and dumps its result to a file.
//
// Steps: reprogram the PU so region 7 covers buffer_big_test, fill the buffer
// with a 16-bit pattern, make region 7 data-cachable and non-bufferable, run
// run_replacement_loop() (which leaves its result in 512 KB of VRAM starting
// at 0x6800000) and write those 512 KB to "randseq<k>.bin".
//
// The file is only flushed/closed when fopen() succeeded (the old code called
// fflush()/fclose() on a null FILE* when the file could not be created), and
// write/close failures are reported on screen instead of being ignored.
// Prints "Done" on success and waits for START before returning.
void test_replacement_strategy()
{
    consoleDemoInit();

    //uintptr_t addr = (uintptr_t)buffer_big_test;
    //uint32_t set = (addr >> 5) & 31;
    /*auto printSet = [](uint32_t set) {
        //printf("St W V DD TAG Data\n");
        for (uint32_t segment = 0; segment < CACHE_ASSOCIATIVITY; segment++)
        {
            uint32_t tag = read_data_tag_ram(set, segment);
            uint32_t data = read_data_cache_ram(set, segment);
            bool valid = (tag >> 4) & 1;
            bool dirty1 = (tag >> 2) & 1;
            bool dirty2 = (tag >> 3) & 1;
            uint32_t tagAddr = tag & 0xFFFFFC00;
            printf("%02lu %lu %c %c%c %08lX = %08lX\n",
                set,
                segment,
                valid ? 'V' : '.',
                dirty1 ? 'D' : '.',
                dirty2 ? 'D' : '.',
                tagAddr,
                data
            );
        }
    };*/

    shift_pu_regions();
    //print_pu_regions();

    // Fill the big buffer with a recognizable pattern
    for (uint32_t i = 0; i < sizeof(buffer_big_test); i += 2)
    {
        *reinterpret_cast<volatile uint16_t *>(&buffer_big_test[i]) = i;
    }

    // Configure region
    set_pu7_data_cachable(true);
    set_pu7_bufferable(false);
    //printf("\n");
    //print_pu_regions();

    /*printf("Round-robin\n");
    set_replacement_strategy(true);
    DC_FlushAll();
    // Read data from the buffer to load it in the cache
    for (uint32_t i = 0; i < 8; i++)
    {
        helper = buffer_big_test[i * DATA_CACHE_SIZE];
        helper = buffer_big_test[i * DATA_CACHE_SIZE + CACHE_LINE_SIZE];
        printf("\n");
        printSet(set);
        printSet(set + 1);
        // Wait for the user to press the A button
        printf("Press A to continue\n");
        scanKeys();
        while ((keysDown() & KEY_A) == 0) { scanKeys(); }
    }
    printf("\n");*/

    // If this fails, fopen() below fails too and the error is reported there.
    fatInitDefault();

    int k = 0;
    //for (int k = 0; k < 3; k++) {

    // Run the test
    run_replacement_loop();

    // Dump contents from VRAM into a file
    videoSetMode(MODE_FB3);

    // Sized generously so any value of k fits; snprintf never overruns it.
    char filename[32];
    snprintf(filename, sizeof(filename), "randseq%d.bin", k);

    bool saved = false;
    FILE *fp = fopen(filename, "wb");
    if (fp != nullptr) {
        saved = true;
        for (uint32_t i = 0; i < 512*1024; i+=4) {
        //for (uint32_t i = 0; i < 1*1024; i+=4) {
            // Copy through a local so fwrite() does not read VRAM directly.
            uint32_t value = *(uint32_t*)(0x6800000 + i);
            if (fwrite(&value, sizeof(uint32_t), 1, fp) != 1) {
                saved = false;
                break;
            }
        }
        //uintptr_t ptr = (uintptr_t)buffer_big_test;
        //fwrite(&ptr, sizeof(ptr), 1, fp);

        // fclose() flushes pending data; a failure means the dump is incomplete.
        if (fclose(fp) != 0)
            saved = false;
    }
    //}

    consoleDemoInit();
    if (saved)
        printf("Done\n");
    else
        printf("Failed to write %s\n", filename);
    printf("Press START to exit\n");

    while (1)
    {
        swiWaitForVBlank();
        scanKeys();
        if (keysDown() & KEY_START)
            break;
    }
}
// Entry point. Exactly one experiment is enabled at a time; the others are
// kept commented out so they can be switched in by hand.
int main(int argc, char **argv)
{
// Manual setup that was used to inspect the cache state by hand.
/*memset((void *)buffer_source_1, 0x11, sizeof(buffer_source_1));
memset((void *)buffer_source_2, 0x22, sizeof(buffer_source_2));
zero();
copy_1();*/
// Available experiments:
//test_flush_invalidate();
//test_raw_debug_regs();
//test_debug_regs();
//test_pu_cache_bits();
//test_dirty_cache_read();
test_replacement_strategy();
return 0;
}
# SPDX-License-Identifier: CC0-1.0
#
# SPDX-FileContributor: Antonio Niño Díaz, 2023
# Project Makefile: only sets the per-project variables; all build rules come
# from Makefile.include.
# User config
# NAME is used by Makefile.include for the ROM/ELF/map file names,
# GAME_TITLE for the title embedded in the ROM.
NAME := data_cache_ops
GAME_TITLE := Data cache ops test
# Source code paths
SOURCEDIRS := source
include Makefile.include
#---------------------------------------------------------------------------------
# Alternative Makefile for building with devkitARM.
# Clearing .SUFFIXES disables make's built-in suffix rules.
#---------------------------------------------------------------------------------
.SUFFIXES:
#---------------------------------------------------------------------------------
# Abort early when the toolchain location is not configured.
ifeq ($(strip $(DEVKITARM)),)
$(error "Please set DEVKITARM in your environment. export DEVKITARM=<path to>devkitARM")
endif
include $(DEVKITARM)/ds_rules
#---------------------------------------------------------------------------------
# TARGET is the name of the output
# BUILD is the directory where object files & intermediate files will be placed
# SOURCES is a list of directories containing source code
# INCLUDES is a list of directories containing extra header files
#---------------------------------------------------------------------------------
TARGET := cache-test
BUILD := build
SOURCES := source
DATA := data
INCLUDES := include
#---------------------------------------------------------------------------------
# options for code generation
# NOTE(review): ARCH selects -mthumb, while the C++ source uses mcr/mrc inline
# assembly, which is presumably only available in ARM state on ARMv5TE -
# confirm that the source actually builds with these flags.
#---------------------------------------------------------------------------------
ARCH := -mthumb -mthumb-interwork -march=armv5te -mtune=arm946e-s
CFLAGS := -g -Wall -O2\
-fomit-frame-pointer\
-ffast-math \
$(ARCH)
CFLAGS += $(INCLUDE) -DARM9
CXXFLAGS := $(CFLAGS) -fno-rtti -fno-exceptions
ASFLAGS := -g $(ARCH)
LDFLAGS = -specs=ds_arm9.specs -g $(ARCH) -Wl,-Map,$(notdir $*.map)
#---------------------------------------------------------------------------------
# any extra libraries we wish to link with the project
#---------------------------------------------------------------------------------
LIBS := -lfilesystem -lfat -lnds9
#---------------------------------------------------------------------------------
# list of directories containing libraries, this must be the top level containing
# include and lib
#---------------------------------------------------------------------------------
LIBDIRS := $(LIBNDS)
#---------------------------------------------------------------------------------
# no real need to edit anything past this point unless you need to add additional
# rules for different file extensions
#---------------------------------------------------------------------------------
# Two-pass build: when make runs outside the build directory, this first
# branch exports the variables and re-invokes make inside $(BUILD) with this
# same Makefile; the "else" branch below then does the actual building.
ifneq ($(BUILD),$(notdir $(CURDIR)))
#---------------------------------------------------------------------------------
export OUTPUT := $(CURDIR)/$(TARGET)
export VPATH := $(foreach dir,$(SOURCES),$(CURDIR)/$(dir)) \
$(foreach dir,$(DATA),$(CURDIR)/$(dir))
export DEPSDIR := $(CURDIR)/$(BUILD)
# Source and data files found in the configured directories (names only;
# VPATH is used to locate them from inside the build directory).
CFILES := $(foreach dir,$(SOURCES),$(notdir $(wildcard $(dir)/*.c)))
CPPFILES := $(foreach dir,$(SOURCES),$(notdir $(wildcard $(dir)/*.cpp)))
SFILES := $(foreach dir,$(SOURCES),$(notdir $(wildcard $(dir)/*.s)))
BINFILES := $(foreach dir,$(DATA),$(notdir $(wildcard $(dir)/*.*)))
#---------------------------------------------------------------------------------
# use CXX for linking C++ projects, CC for standard C
#---------------------------------------------------------------------------------
ifeq ($(strip $(CPPFILES)),)
#---------------------------------------------------------------------------------
export LD := $(CC)
#---------------------------------------------------------------------------------
else
#---------------------------------------------------------------------------------
export LD := $(CXX)
#---------------------------------------------------------------------------------
endif
#---------------------------------------------------------------------------------
export OFILES := $(addsuffix .o,$(BINFILES)) \
$(CPPFILES:.cpp=.o) $(CFILES:.c=.o) $(SFILES:.s=.o)
# NOTE(review): the LIBDIRS include line appears twice below; the duplicate
# looks redundant.
export INCLUDE := $(foreach dir,$(INCLUDES),-I$(CURDIR)/$(dir)) \
$(foreach dir,$(LIBDIRS),-I$(dir)/include) \
$(foreach dir,$(LIBDIRS),-I$(dir)/include) \
-I$(CURDIR)/$(BUILD)
export LIBPATHS := $(foreach dir,$(LIBDIRS),-L$(dir)/lib)
.PHONY: $(BUILD) clean
#---------------------------------------------------------------------------------
# Create the build directory if needed and run the second pass inside it.
$(BUILD):
@[ -d $@ ] || mkdir -p $@
@$(MAKE) --no-print-directory -C $(BUILD) -f $(CURDIR)/Makefile
#---------------------------------------------------------------------------------
# Remove the build directory and the generated binaries.
clean:
@echo clean ...
@rm -fr $(BUILD) $(TARGET).elf $(TARGET).nds $(TARGET).ds.gba
#---------------------------------------------------------------------------------
else
# Second pass: running inside the build directory.
DEPENDS := $(OFILES:.o=.d)
#---------------------------------------------------------------------------------
# main targets
#---------------------------------------------------------------------------------
$(OUTPUT).nds : $(OUTPUT).elf
$(OUTPUT).elf : $(OFILES)
#---------------------------------------------------------------------------------
# Convert binary data files into linkable objects.
%.bin.o : %.bin
#---------------------------------------------------------------------------------
@echo $(notdir $<)
@$(bin2o)
# Pull in the generated dependency files; missing ones are ignored.
-include $(DEPENDS)
#---------------------------------------------------------------------------------------
endif
#---------------------------------------------------------------------------------------
# SPDX-License-Identifier: CC0-1.0
#
# SPDX-FileContributor: Antonio Niño Díaz, 2023
# Shared build rules, included by the project Makefile after it has set NAME,
# GAME_TITLE and SOURCEDIRS.
# Installation paths; "?=" lets the environment override them.
BLOCKSDS ?= /opt/blocksds/core
BLOCKSDSEXT ?= /opt/blocksds/external
# User config
# ===========
GAME_SUBTITLE := Built with BlocksDS
GAME_AUTHOR := github.com/blocksds/sdk
GAME_ICON := $(BLOCKSDS)/sys/icon.bmp
# DLDI and internal SD slot of DSi
# --------------------------------
# Root folder of the SD image
SDROOT := sdroot
# Name of the generated image; no$gba in DSi mode presumably expects it to be
# named "DSi-1.sd"
SDIMAGE := image.bin
# Libraries
# ---------
# LIBS uses "?=" so a project Makefile can provide its own list.
LIBS ?= -lnds9 -lc
LIBDIRS += $(BLOCKSDS)/libs/libnds
# Build artifacts
# ---------------
BUILDDIR := build
ELF := build/$(NAME).elf
DUMP := build/$(NAME).dump
NITROFAT_IMG := build/nitrofat.bin
MAP := build/$(NAME).map
SOUNDBANKDIR := $(BUILDDIR)/maxmod
ROM := $(NAME).nds
# Tools
# -----
PREFIX := arm-none-eabi-
CC := $(PREFIX)gcc
CXX := $(PREFIX)g++
OBJDUMP := $(PREFIX)objdump
MKDIR := mkdir
RM := rm -rf
# Verbose flag
# ------------
# $(V) prefixes the recipe commands: empty with VERBOSE=1 (commands are
# echoed), "@" otherwise (commands are hidden).
ifeq ($(VERBOSE),1)
V :=
else
V := @
endif
# Source files
# ------------
# Optional asset directories: each is only scanned when the project Makefile
# defines the corresponding variable, and the matching build sub-directory is
# added to the include path.
ifneq ($(BINDIRS),)
SOURCES_BIN := $(shell find -L $(BINDIRS) -name "*.bin")
INCLUDEDIRS += $(addprefix $(BUILDDIR)/,$(BINDIRS))
endif
ifneq ($(GFXDIRS),)
SOURCES_PNG := $(shell find -L $(GFXDIRS) -name "*.png")
INCLUDEDIRS += $(addprefix $(BUILDDIR)/,$(GFXDIRS))
endif
ifneq ($(AUDIODIRS),)
SOURCES_AUDIO := $(shell find -L $(AUDIODIRS) -regex '.*\.\(it\|mod\|s3m\|wav\|xm\)')
ifneq ($(SOURCES_AUDIO),)
INCLUDEDIRS += $(SOUNDBANKDIR)
endif
endif
# Assembly, C and C++ sources found (recursively) under SOURCEDIRS.
SOURCES_S := $(shell find -L $(SOURCEDIRS) -name "*.s")
SOURCES_C := $(shell find -L $(SOURCEDIRS) -name "*.c")
SOURCES_CPP := $(shell find -L $(SOURCEDIRS) -name "*.cpp")
# Compiler and linker flags
# -------------------------
DEFINES += -D__NDS__ -DARM9
ARCH := -march=armv5te -mtune=arm946e-s
WARNFLAGS := -Wall
# Link with the C++ driver as soon as there is at least one C++ source.
ifeq ($(SOURCES_CPP),)
LD := $(CC)
else
LD := $(CXX)
endif
INCLUDEFLAGS := $(foreach path,$(INCLUDEDIRS),-I$(path)) \
$(foreach path,$(LIBDIRS),-I$(path)/include)
LIBDIRSFLAGS := $(foreach path,$(LIBDIRS),-L$(path)/lib)
# All three languages are built with -marm.
ASFLAGS += -x assembler-with-cpp $(DEFINES) $(ARCH) \
-marm -mthumb-interwork $(INCLUDEFLAGS) \
-ffunction-sections -fdata-sections
CFLAGS += -std=gnu11 $(WARNFLAGS) $(DEFINES) $(ARCH) \
-marm -mthumb-interwork $(INCLUDEFLAGS) -O2 \
-ffunction-sections -fdata-sections \
-fomit-frame-pointer
CXXFLAGS += -std=gnu++14 $(WARNFLAGS) $(DEFINES) $(ARCH) \
-marm -mthumb-interwork $(INCLUDEFLAGS) -O2 \
-ffunction-sections -fdata-sections \
-fno-exceptions -fno-rtti \
-fomit-frame-pointer
# -nostdlib: the libraries are listed explicitly in the --start-group below.
LDFLAGS := -marm -mthumb-interwork $(LIBDIRSFLAGS) \
-Wl,-Map,$(MAP) -Wl,--gc-sections -nostdlib \
-T$(BLOCKSDS)/sys/crts/ds_arm9.mem \
-T$(BLOCKSDS)/sys/crts/ds_arm9.ld \
-Wl,--no-warn-rwx-segments \
-Wl,--start-group $(LIBS) -lgcc -Wl,--end-group
# Intermediate build files
# ------------------------
# Every input file maps to "$(BUILDDIR)/<path>.o"; converted assets also
# produce a header.
OBJS_ASSETS := $(addsuffix .o,$(addprefix $(BUILDDIR)/,$(SOURCES_BIN))) \
$(addsuffix .o,$(addprefix $(BUILDDIR)/,$(SOURCES_PNG)))
HEADERS_ASSETS := $(patsubst %.bin,%_bin.h,$(addprefix $(BUILDDIR)/,$(SOURCES_BIN))) \
$(patsubst %.png,%.h,$(addprefix $(BUILDDIR)/,$(SOURCES_PNG)))
ifneq ($(SOURCES_AUDIO),)
OBJS_ASSETS += $(SOUNDBANKDIR)/soundbank.c.o
HEADERS_ASSETS += $(SOUNDBANKDIR)/soundbank.h
endif
OBJS_SOURCES := $(addsuffix .o,$(addprefix $(BUILDDIR)/,$(SOURCES_S))) \
$(addsuffix .o,$(addprefix $(BUILDDIR)/,$(SOURCES_C))) \
$(addsuffix .o,$(addprefix $(BUILDDIR)/,$(SOURCES_CPP)))
OBJS := $(OBJS_ASSETS) $(OBJS_SOURCES)
# Dependency files written next to the objects by the -MMD -MP compile flags.
DEPS := $(OBJS:.o=.d)
# Targets
# -------
.PHONY: all clean dump dldipatch sdimage
all: $(ROM)
# Optional FAT filesystem image. Everything inside this conditional is only
# defined when NITROFATDIR is non-blank once surrounding whitespace is
# stripped.
ifneq ($(strip $(NITROFATDIR)),)
# Additional arguments for ndstool
NDSTOOL_FAT := -F $(NITROFAT_IMG)
# Build the image from NITROFATDIR with mkfatimg.
# NOTE(review): the only prerequisite is the directory itself, so a rebuild is
# triggered by the directory's own timestamp; edits to files already inside it
# may not be noticed. Confirm this is acceptable.
$(NITROFAT_IMG): $(NITROFATDIR)
	@echo " MKFATIMG $@ $(NITROFATDIR)"
	$(V)$(BLOCKSDS)/tools/mkfatimg/mkfatimg -t $(NITROFATDIR) $@ 0
# Make the NDS ROM depend on the filesystem image only if it is needed
$(ROM): $(NITROFAT_IMG)
endif
# Build the title string handed to ndstool: fields are joined with ';' and the
# subtitle field is left out entirely when GAME_SUBTITLE is blank.
ifneq ($(strip $(GAME_SUBTITLE)),)
    GAME_FULL_TITLE := $(GAME_TITLE);$(GAME_SUBTITLE);$(GAME_AUTHOR)
else
    GAME_FULL_TITLE := $(GAME_TITLE);$(GAME_AUTHOR)
endif
# Package the NDS ROM with ndstool. The ARM9 side is the ELF linked by this
# Makefile; the ARM7 side is the prebuilt default ELF found under $(BLOCKSDS).
# The -b arguments supply the icon file and the combined title string.
# NDSTOOL_FAT is only assigned in the NITROFATDIR conditional, so it expands
# to nothing when no filesystem image is requested.
# NOTE(review): GAME_ICON and the ARM7 ELF are used by the recipe but are not
# prerequisites, so changing them alone does not rebuild the ROM.
$(ROM): $(ELF)
	@echo " NDSTOOL $@"
	$(V)$(BLOCKSDS)/tools/ndstool/ndstool -c $@ \
		-7 $(BLOCKSDS)/sys/default_arm7/arm7.elf -9 $(ELF) \
		-b $(GAME_ICON) "$(GAME_FULL_TITLE)" \
		$(NDSTOOL_FAT)
# Link every object together with the ARM9 crt0 object. LD is the C or C++
# compiler driver; LDFLAGS comes last so the libraries it names are searched
# after the objects that reference them.
$(ELF): $(OBJS)
	@echo " LD $@"
	$(V)$(LD) -o $@ $(OBJS) $(BLOCKSDS)/sys/crts/ds_arm9_crt0.o $(LDFLAGS)
# Write a listing of the ELF to $(DUMP): section headers (-h), demangled
# symbol names (-C) and disassembly interleaved with source (-S).
$(DUMP): $(ELF)
	@echo " OBJDUMP $@"
	$(V)$(OBJDUMP) -h -C -S $< > $@
# Convenience alias for producing the listing.
dump: $(DUMP)
# Remove the ROM, the listing, the whole build directory and the SD image.
# NOTE(review): RM is defined outside this section; since BUILDDIR is a
# directory it presumably removes recursively - confirm.
clean:
	@echo " CLEAN"
	$(V)$(RM) $(ROM) $(DUMP) $(BUILDDIR) $(SDIMAGE)
# Build the SD card image $(SDIMAGE) from the directory $(SDROOT) with
# mkfatimg. The target is phony and has no prerequisites, so the image is
# regenerated on every invocation.
sdimage:
	@echo " MKFATIMG $(SDIMAGE) $(SDROOT)"
	$(V)$(BLOCKSDS)/tools/mkfatimg/mkfatimg -t $(SDROOT) $(SDIMAGE) 0
# Run dlditool on the built ROM with the r4tfv2 DLDI file shipped under
# $(BLOCKSDS). The ROM is passed as the file to patch, so it is modified in
# place.
dldipatch: $(ROM)
	@echo " DLDITOOL $(ROM)"
	$(V)$(BLOCKSDS)/tools/dlditool/dlditool \
		$(BLOCKSDS)/tools/dldi/r4tfv2.dldi $(ROM)
# Rules
# -----
# Assemble a .s source into $(BUILDDIR), creating the output directory first.
# The C compiler driver is used so that ASFLAGS can request preprocessing;
# -MMD -MP writes a .d dependency file next to the object.
$(BUILDDIR)/%.s.o : %.s
	@echo " AS $<"
	@$(MKDIR) -p $(@D)
	$(V)$(CC) $(ASFLAGS) -MMD -MP -c -o $@ $<
# Compile a C source into $(BUILDDIR), creating the output directory first.
# -MMD -MP writes a .d dependency file next to the object.
$(BUILDDIR)/%.c.o : %.c
	@echo " CC $<"
	@$(MKDIR) -p $(@D)
	$(V)$(CC) $(CFLAGS) -MMD -MP -c -o $@ $<
# Compile a C++ source into $(BUILDDIR), creating the output directory first.
# -MMD -MP writes a .d dependency file next to the object.
$(BUILDDIR)/%.cpp.o : %.cpp
	@echo " CXX $<"
	@$(MKDIR) -p $(@D)
	$(V)$(CXX) $(CXXFLAGS) -MMD -MP -c -o $@ $<
# Turn a .bin file into an object plus a header. bin2c is given the input file
# and the output directory; the generated $*_bin.c is then compiled.
# The rule has two targets and $@ names whichever one triggered it, so the
# compile step spells out the object and source paths explicitly.
$(BUILDDIR)/%.bin.o $(BUILDDIR)/%_bin.h : %.bin
	@echo " BIN2C $<"
	@$(MKDIR) -p $(@D)
	$(V)$(BLOCKSDS)/tools/bin2c/bin2c $< $(@D)
	$(V)$(CC) $(CFLAGS) -MMD -MP -c -o $(BUILDDIR)/$*.bin.o $(BUILDDIR)/$*_bin.c
# Turn a .png image into an object plus a header with grit. The pattern only
# applies when a .grit file with the same stem sits next to the image. The
# generated $*.c is then compiled.
# NOTE(review): -ftc presumably selects C source output and -W1 a warning
# level - confirm against the grit documentation.
# The rule has two targets and $@ names whichever one triggered it, so the
# compile step spells out the object and source paths explicitly.
$(BUILDDIR)/%.png.o $(BUILDDIR)/%.h : %.png %.grit
	@echo " GRIT $<"
	@$(MKDIR) -p $(@D)
	$(V)$(BLOCKSDS)/tools/grit/grit $< -ftc -W1 -o$(BUILDDIR)/$*
	$(V)$(CC) $(CFLAGS) -MMD -MP -c -o $(BUILDDIR)/$*.png.o $(BUILDDIR)/$*.c
# Build the soundbank from all audio sources with mmutil. A single run writes
# both soundbank.bin and soundbank.h into SOUNDBANKDIR; only the header is
# named as the target, and the soundbank object rule depends on it.
# The tool invocation is prefixed with $(V), like every other tool invocation
# in this Makefile, so that it is echoed whenever V is set to show commands.
# NOTE(review): -d presumably selects the NDS soundbank format - confirm
# against the mmutil documentation.
$(SOUNDBANKDIR)/soundbank.h: $(SOURCES_AUDIO)
	@echo " MMUTIL $^"
	@$(MKDIR) -p $(@D)
	$(V)$(BLOCKSDS)/tools/mmutil/mmutil $^ -d \
		-o$(SOUNDBANKDIR)/soundbank.bin -h$(SOUNDBANKDIR)/soundbank.h
# Convert soundbank.bin to C with bin2c and compile the result. The header is
# the prerequisite because the rule that creates it also writes soundbank.bin,
# so the binary exists by the time this recipe runs.
$(SOUNDBANKDIR)/soundbank.c.o: $(SOUNDBANKDIR)/soundbank.h
	@echo " BIN2C soundbank.bin"
	$(V)$(BLOCKSDS)/tools/bin2c/bin2c \
		$(SOUNDBANKDIR)/soundbank.bin $(SOUNDBANKDIR)
	@echo " CC.9 soundbank_bin.c"
	$(V)$(CC) $(CFLAGS) -MMD -MP -c \
		-o $@ $(SOUNDBANKDIR)/soundbank_bin.c
# All assets must be built before the source code
# -----------------------------------------------
# The source files themselves are listed as targets here: making them depend
# on every generated asset header forces those headers to be produced before
# any object that is built from the sources.
$(SOURCES_S) $(SOURCES_C) $(SOURCES_CPP): $(HEADERS_ASSETS)
# Include dependency files if they exist
# --------------------------------------
# The leading '-' keeps make quiet about .d files that have not been
# generated yet (for example on the first build).
-include $(DEPS)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment