Created
August 22, 2017 15:27
-
-
Save bigbrett/a7a204ca42d65749a1bedff531e25c9c to your computer and use it in GitHub Desktop.
Xilinx kernel module demonstrating kernel<-->PL memory mapped acces with and without caches using kernel asm/io.h primitives
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* CPU <-> PL Communication Example | |
* | |
* This small example is designed to show how to communicate between the CPU and the PL | |
* without using DMA. For some applications the amount of data being transferred may be | |
* small enough that is quicker to not use DMA. | |
* | |
* This example assumes a BRAM is on the PL for the testing at a specific address. | |
* It includes several test methods, cached and non-cached, to compare the performance. | |
* There are likely more optimization methods that could be applied as this is not | |
* intended to be that (such as LDM/STRM). This BRAM is connected to the GP0 and GP1 | |
* master interfaces of the PS such that it can be used with a cached and non-cached | |
* mapping at the same time. This allows the cached operations to be verified using | |
* the non-cached interface. | |
* | |
* This is a kernel module that is inserted into the kernel to run a single test, then | |
* removed and reinserted to run another test. It has module parameters that can be | |
* set from the command line to cause different behavior such as cached or not, and | |
* write combined (for non-cached). All the parameters are optional as they default | |
* to specific values which can be seen in the code. | |
* | |
* cpu-pl-test.ko transfer_length=<length> cached=<0,1> write_combined=<0,1> | |
* | |
* An ILA can be used in the PL to review the AXI transactions that result from | |
* each of the transfers. | |
* | |
* Notes: | |
* | |
* 1. The time differencing for the performance numbers needs work for when the usec | |
* wraps around to zero such that you might randomly see a very large time number. | |
* | |
* 2. There are no memory barriers being used in this application and they might be | |
* needed in a real application. | |
*/ | |
#include <linux/dmaengine.h> | |
#include <linux/module.h> | |
#include <linux/version.h> | |
#include <linux/kernel.h> | |
#include <linux/slab.h> | |
#include <asm/cacheflush.h> | |
#include <asm/outercache.h> | |
static void __iomem *test_bram_p; | |
static void __iomem *test_bram_uncached_p; | |
unsigned int transfer_length = 1024; | |
module_param(transfer_length, int, S_IRUGO); | |
unsigned int cached = 1; | |
module_param(cached, int, S_IRUGO); | |
unsigned int write_combined = 0; | |
module_param(write_combined, int, S_IRUGO); | |
unsigned int zero = 1; | |
module_param(zero, int, S_IRUGO); | |
#define TEST_BRAM_ADDRESS 0x40000000 | |
#define TEST_BRAM_UNCACHED_ADDRESS 0x80000000 | |
#define TEST_BRAM_SIZE_BYTES (256 * 1024) | |
int test_cached_cpu_accesses(void) | |
{ | |
int i; | |
struct timeval time1, time2; | |
u32 *src_buffer; | |
u32 *dest_buffer; | |
u32 *p; | |
/* Create buffers to use for testing so that the BRAM is not operated | |
* on except where needed to minimize cache side affects | |
*/ | |
src_buffer = kmalloc(transfer_length, GFP_KERNEL); | |
dest_buffer = kzalloc(transfer_length, GFP_KERNEL); | |
if (!src_buffer || !dest_buffer) { | |
printk(KERN_ERR "Allocating DMA memory failed\n"); | |
return -1; | |
} | |
/* Initialize a source buffer with known data that is easy to see | |
* on the ILA from AXI | |
*/ | |
for (i = 0; i < transfer_length/4; i++) | |
src_buffer[i] = i; | |
/* Map the BRAM into the virtual address space as uncached memory | |
* so that it's as fast as possible | |
*/ | |
test_bram_uncached_p = ioremap(TEST_BRAM_UNCACHED_ADDRESS, TEST_BRAM_SIZE_BYTES); | |
if (!test_bram_uncached_p) { | |
printk("Could not map physical memory to virtual\n"); | |
return -1; | |
} | |
printk("bram uncached virtual address: %08X\n", (unsigned int)test_bram_uncached_p); | |
p = test_bram_uncached_p; | |
for (i = 0; i < transfer_length/4; i++) | |
__raw_writel(0, p++); | |
/* Map the BRAM into the virtual address space as cached memory | |
* so that it's as fast as possible | |
*/ | |
test_bram_p = ioremap_cache(TEST_BRAM_ADDRESS, TEST_BRAM_SIZE_BYTES); | |
if (!test_bram_p) { | |
printk("Could not map physical memory to virtual\n"); | |
return -1; | |
} | |
printk("bram virtual address: %08X\n", (unsigned int)test_bram_p); | |
p = test_bram_uncached_p; | |
for (i = 0; i < transfer_length/4; i++) | |
if (__raw_readl(p++) != 0) | |
printk("Error: BRAM not zeroed\n"); | |
/* Initialize the BRAM from the source buffer and then make sure it's | |
* written out from the cache to the memory (L1, then L2) | |
*/ | |
p = test_bram_p; | |
do_gettimeofday(&time1); | |
for (i = 0; i < transfer_length/4; i++) | |
__raw_writel(src_buffer[i], p++); | |
do_gettimeofday(&time2); | |
printk(KERN_INFO "cpu write time [us]: %ld\n", (time2.tv_usec - time1.tv_usec)); | |
/* Ideally BRAM is still zero, but some of the cache could have been written | |
* to memory if some lines needed to be evicted. It would be nice to test to test | |
* that memory is still zero but that can't be done. | |
*/ | |
/* Write the caches (L1 then L2) out to the BRAM, note the order of this | |
* is important as for writes caches operations must be done to the cache | |
* closest to CPU 1st | |
*/ | |
do_gettimeofday(&time1); | |
__cpuc_flush_dcache_area(test_bram_p, transfer_length); | |
outer_clean_range(TEST_BRAM_ADDRESS, TEST_BRAM_ADDRESS + (transfer_length - 1)); | |
do_gettimeofday(&time2); | |
printk(KERN_INFO "cpu write cache time [us]: %ld\n", (time2.tv_usec - time1.tv_usec)); | |
/* Verify the BRAM was written correctly to ensure that the caches were | |
* written | |
*/ | |
p = test_bram_uncached_p; | |
for (i = 0; i < transfer_length/4; i++) | |
if (__raw_readl(p++) != src_buffer[i]) | |
printk("Error: BRAM not written at address %08X\n", p - 1); | |
/* Initialize a source buffer with known different data to write to the BRAM | |
* without using the cache | |
*/ | |
for (i = 0; i < transfer_length/4; i++) | |
src_buffer[i] = (transfer_length/4) - i; | |
/* Write new data (different) to the BRAM thru the non-cached interface to verify that | |
* cache operations to invalidate it are working | |
*/ | |
p = test_bram_uncached_p; | |
for (i = 0; i < TEST_BRAM_SIZE_BYTES/4; i++) | |
__raw_writel(src_buffer[i], p++); | |
/* Invalidate the caches (L2, then L1) so that then next reads of the BRAM in cache should | |
* cause it to be fetched from memory, note for reads cache operations must be done to the | |
* cache closest to the memory 1st | |
*/ | |
do_gettimeofday(&time1); | |
outer_inv_range(TEST_BRAM_ADDRESS, TEST_BRAM_ADDRESS + (transfer_length - 1)); | |
__cpuc_flush_dcache_area(test_bram_p, transfer_length); | |
do_gettimeofday(&time2); | |
printk(KERN_INFO "cpu read cache time [us]: %ld\n", (time2.tv_usec - time1.tv_usec)); | |
/* Read from the BRAM into a buffer */ | |
p = test_bram_p; | |
do_gettimeofday(&time1); | |
for (i = 0; i < transfer_length/4; i++) | |
dest_buffer[i] = __raw_readl(p++); | |
do_gettimeofday(&time2); | |
printk(KERN_INFO "cpu read time [us]: %ld\n", (time2.tv_usec - time1.tv_usec)); | |
/* Verify the data was correct */ | |
for (i = 0; i < transfer_length/4; i++) { | |
if (dest_buffer[i] != src_buffer[i]) { | |
printk("index = %d, src = %02X, dest = %02X\n", i, src_buffer[i], dest_buffer[i]); | |
} | |
} | |
printk("Transferred %d bytes\n", transfer_length); | |
iounmap(test_bram_uncached_p); | |
iounmap(test_bram_p); | |
kfree(src_buffer); | |
kfree(dest_buffer); | |
return 0; | |
} | |
int test_noncached_cpu_accesses(void) | |
{ | |
int i; | |
struct timeval time1, time2; | |
u32 *src_buffer; | |
u32 *dest_buffer; | |
u32 *p; | |
/* Create buffers to use for testing so that the BRAM is not operated | |
* on except where needed to minimize cache side affects | |
*/ | |
src_buffer = kmalloc(transfer_length, GFP_KERNEL); | |
dest_buffer = kzalloc(transfer_length, GFP_KERNEL); | |
if (!src_buffer || !dest_buffer) { | |
printk(KERN_ERR "Allocating DMA memory failed\n"); | |
return -1; | |
} | |
/* Initialize a source buffer with known data that is easy to see | |
* on the ILA from AXI | |
*/ | |
for (i = 0; i < transfer_length/4; i++) { | |
src_buffer[i] = i; | |
} | |
/* Map the BRAM into the virtual address space as non-cached memory | |
* as normal device memory or device memory that is write combined | |
* so that writes are much faste | |
*/ | |
if (write_combined) | |
test_bram_p = ioremap_wc(TEST_BRAM_ADDRESS, TEST_BRAM_SIZE_BYTES); | |
else | |
test_bram_p = ioremap(TEST_BRAM_ADDRESS, TEST_BRAM_SIZE_BYTES); | |
if (!test_bram_p) { | |
printk("Could not map physical memory to virtual\n"); | |
return -1; | |
} | |
printk("bram virtual address: %08X\n", (unsigned int)test_bram_p); | |
/* Write the data to the BRAM and measure the time */ | |
do_gettimeofday(&time1); | |
p = test_bram_p; | |
for (i = 0; i < transfer_length/4; i++) | |
__raw_writel(src_buffer[i], p++); | |
do_gettimeofday(&time2); | |
printk(KERN_INFO "cpu write time [us]: %ld\n", (time2.tv_usec - time1.tv_usec)); | |
/* Read the data back from the BRAM and measure the time */ | |
p = test_bram_p; | |
do_gettimeofday(&time1); | |
for (i = 0; i < transfer_length/4; i++) | |
dest_buffer[i] = __raw_readl(p++); | |
do_gettimeofday(&time2); | |
printk(KERN_INFO "cpu read time [us]: %ld\n", (time2.tv_usec - time1.tv_usec)); | |
/* Verify the data was correct */ | |
for (i = 0; i < transfer_length/4; i++) { | |
if (dest_buffer[i] != src_buffer[i]) { | |
printk("index = %d, src = %02X, dest = %02X\n", i, src_buffer[i], dest_buffer[i]); | |
} | |
} | |
printk("Transferred %d bytes\n", transfer_length); | |
iounmap(test_bram_p); | |
kfree(src_buffer); | |
kfree(dest_buffer); | |
return 0; | |
} | |
static int __init cpu_test_init(void) | |
{ | |
printk(KERN_INFO "CPU<->PL test module initialized\n"); | |
printk(KERN_INFO "transfer length: %d cached: %d write combined: %d\n", | |
transfer_length, cached, write_combined); | |
if (cached) | |
test_cached_cpu_accesses(); | |
else | |
test_noncached_cpu_accesses(); | |
return 0; | |
} | |
static void __exit cpu_test_exit(void) | |
{ | |
printk(KERN_INFO "CPU<->PL test module exited\n"); | |
} | |
module_init(cpu_test_init); | |
module_exit(cpu_test_exit); | |
MODULE_LICENSE("GPL"); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment