Skip to content

Instantly share code, notes, and snippets.

@iambrj
Created November 6, 2021 16:18
Show Gist options
  • Save iambrj/1500993ba4e031e8b50b52c040f9a702 to your computer and use it in GitHub Desktop.
/**********
Copyright (c) 2019-2020, Xilinx, Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********/
#include "event_timer.hpp"
#include <iostream>
#include <memory>
#include <string>
// Xilinx OpenCL and XRT includes
#include "xilinx_ocl.hpp"
// Software reference implementation of element-wise vector multiply:
// c[i] = a[i] * b[i] for every i in [0, size). Used on the host to
// produce the golden result the FPGA kernel output is checked against.
void vmul_sw(float *a, float *b, float *c, uint32_t size)
{
    uint32_t idx = 0;
    while (idx < size) {
        c[idx] = a[idx] * b[idx];
        ++idx;
    }
}
int main(int argc, char *argv[])
{
// Initialize an event timer we'll use for monitoring the application
EventTimer et;
// Check if the binary file is passed as argument
if (argc != 2) {
std::cout << "Usage: " << argv[0] << " <XCLBIN File>" << std::endl;
return EXIT_FAILURE;
}
// Copy binary name
char* binaryName = argv[1];
// Get target and set BUFSIZE 1024 times bigger for hw runs
std::string target = getenv("XCL_EMULATION_MODE");
uint32_t BUFSIZE = (target.compare("hw") == 0) ? (1024 * 1024 * 32) : (1024 * 32);
if(argc == 3) {
BUFSIZE = std::stoi(argv[2]);
}
std::cout << "-- Parallelizing the Data Path --" << std::endl << std::endl;
// Initialize the runtime (including a command queue) and load the
// FPGA image
std::cout << "Loading " << binaryName << " to program the board" << std::endl << std::endl;
et.add("OpenCL Initialization");
// This application will use the first Xilinx device found in the system
swm::XilinxOcl xocl;
xocl.initialize(binaryName);
cl::CommandQueue q = xocl.get_command_queue();
cl::Kernel krnl = xocl.get_kernel("wide_vmul");
et.finish();
/// New code for example 01
std::cout << "Running kernel test XRT-allocated buffers and wide data path:" << std::endl
<< std::endl;
// Map our user-allocated buffers as OpenCL buffers using a shared
// host pointer
et.add("Allocate contiguous OpenCL buffers");
cl_mem_ext_ptr_t bank_ext;
bank_ext.flags = 0 | XCL_MEM_TOPOLOGY;
bank_ext.obj = NULL;
bank_ext.param = 0;
cl::Buffer a_buf(xocl.get_context(),
static_cast<cl_mem_flags>(CL_MEM_READ_ONLY),
BUFSIZE * sizeof(float),
NULL,
NULL);
cl::Buffer b_buf(xocl.get_context(),
static_cast<cl_mem_flags>(CL_MEM_READ_ONLY),
BUFSIZE * sizeof(float),
NULL,
NULL);
cl::Buffer c_buf(xocl.get_context(),
static_cast<cl_mem_flags>(CL_MEM_READ_WRITE),
BUFSIZE * sizeof(float),
NULL,
NULL);
cl::Buffer d_buf(xocl.get_context(),
static_cast<cl_mem_flags>(CL_MEM_READ_WRITE |
CL_MEM_ALLOC_HOST_PTR |
CL_MEM_EXT_PTR_XILINX),
BUFSIZE * sizeof(float),
&bank_ext,
NULL);
et.finish();
// Set vmul kernel arguments. We do this before mapping the buffers to allow XRT
// to allocate the buffers in the appropriate memory banks for the selected
// kernels. For buffer 'd' we explicitly set a bank above, but this buffer is
// never migrated to the Alveo card so this mapping is theoretical.
et.add("Set kernel arguments");
krnl.setArg(0, a_buf);
krnl.setArg(1, b_buf);
krnl.setArg(2, c_buf);
krnl.setArg(3, BUFSIZE);
et.add("Map buffers to user space pointers");
float *a = (float *)q.enqueueMapBuffer(a_buf,
CL_TRUE,
CL_MAP_WRITE,
0,
BUFSIZE * sizeof(float));
float *b = (float *)q.enqueueMapBuffer(b_buf,
CL_TRUE,
CL_MAP_WRITE,
0,
BUFSIZE * sizeof(float));
float *d = (float *)q.enqueueMapBuffer(d_buf,
CL_TRUE,
CL_MAP_WRITE | CL_MAP_READ,
0,
BUFSIZE * sizeof(float));
et.finish();
et.add("Populating buffer inputs");
for (uint32_t i = 0; i < BUFSIZE; i++) {
a[i] = i;
b[i] = 2 * i;
}
et.finish();
// For comparison, let's have the CPU calculate the result
et.add("Software VADD run");
vmul(a, b, d, BUFSIZE);
et.finish();
// Send the buffers down to the Alveo card
et.add("Memory object migration enqueue");
cl::Event event_sp;
q.enqueueMigrateMemObjects({a_buf, b_buf}, 0, NULL, &event_sp);
clWaitForEvents(1, (const cl_event *)&event_sp);
et.add("OCL Enqueue task");
q.enqueueTask(krnl, NULL, &event_sp);
et.add("Wait for kernel to complete");
clWaitForEvents(1, (const cl_event *)&event_sp);
// Migrate memory back from device
et.add("Read back computation results");
float *c = (float *)q.enqueueMapBuffer(c_buf,
CL_TRUE,
CL_MAP_READ,
0,
BUFSIZE * sizeof(float));
et.finish();
// Verify the results
bool verified = true;
for (uint32_t i = 0; i < BUFSIZE; i++) {
if (c[i] != d[i]) {
verified = false;
std::cout << "ERROR: software and hardware vmul do not match: "
<< c[i] << "!=" << d[i] << " at position " << i << std::endl;
break;
}
}
if (verified) {
std::cout
<< std::endl
<< "OCL-mapped contiguous buffer example complete successfully!"
<< std::endl
<< std::endl;
}
else {
std::cout
<< std::endl
<< "OCL-mapped contiguous buffer example complete! (with errors)"
<< std::endl
<< std::endl;
}
std::cout << "--------------- Key execution times ---------------" << std::endl;
q.enqueueUnmapMemObject(a_buf, a);
q.enqueueUnmapMemObject(b_buf, b);
q.enqueueUnmapMemObject(c_buf, c);
q.enqueueUnmapMemObject(d_buf, d);
q.finish();
et.print();
}
/**********
Copyright (c) 2018-2020, Xilinx, Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********/
/*******************************************************************************
Description:
Wide Memory Access Example using ap_uint<Width> datatype
Description: This is a vector multiplication example to demonstrate wide
memory access with a 512-bit data width using the ap_uint<> datatype, which
is defined inside the 'ap_int.h' file.
*******************************************************************************/
//Including to use ap_uint<> datatype
#include <ap_int.h>
#include <stdio.h>
#include <string.h>
// Number of 512-bit words buffered on-chip per burst iteration.
#define BUFFER_SIZE 64
// Width of the kernel's memory-side datapath, in bits.
#define DATAWIDTH 512
#define VECTOR_SIZE (DATAWIDTH / 32) // vector size is 16 (512/32 = 16)
// One 512-bit wide word; each holds VECTOR_SIZE packed 32-bit lanes.
typedef ap_uint<DATAWIDTH> uint512_dt;
//TRIPCOUNT identifier
// NOTE(review): these constants appear intended for the LOOP_TRIPCOUNT
// pragmas, but the pragmas in the kernel below use literal bounds instead —
// confirm whether these are still needed.
const unsigned int c_chunk_sz = BUFFER_SIZE;
const unsigned int c_size = VECTOR_SIZE;
/*
Vector Multiplication Kernel Implementation using uint512_dt datatype
Arguments:
in1 (input) --> Input Vector1
in2 (input) --> Input Vector2
out (output) --> Output Vector
size (input) --> Size of Vector in Integer
*/
extern "C"
{
// Wide-datapath element-wise vector multiply kernel. Reads both inputs in
// 512-bit words, multiplies the sixteen 32-bit lanes of each word pairwise,
// and writes the packed 512-bit results to 'out'.
// NOTE(review): the lanes are multiplied as 32-bit integer bit-fields. The
// host in this gist fills the buffers with floats; integer multiplication of
// float bit patterns is not float multiplication — confirm the intended
// element type against the host code.
void wide_vmul(
const uint512_dt *in1, // Read-Only Vector 1
const uint512_dt *in2, // Read-Only Vector 2
uint512_dt *out, // Output Result
int size // Size in integer
)
{
#pragma HLS INTERFACE m_axi port = in1 max_read_burst_length = 32 offset = slave bundle = gmem
#pragma HLS INTERFACE m_axi port = in2 max_read_burst_length = 32 offset = slave bundle = gmem1
#pragma HLS INTERFACE m_axi port = out max_write_burst_length = 32 offset = slave bundle = gmem2
#pragma HLS INTERFACE s_axilite port = in1 bundle = control
#pragma HLS INTERFACE s_axilite port = in2 bundle = control
#pragma HLS INTERFACE s_axilite port = out bundle = control
#pragma HLS INTERFACE s_axilite port = size bundle = control
#pragma HLS INTERFACE s_axilite port = return bundle = control
uint512_dt v1_local[BUFFER_SIZE]; // Local memory to store vector1
uint512_dt v2_local[BUFFER_SIZE]; // Local memory to store vector2
// Input vector size is given in 32-bit elements, but the kernel accesses
// 512-bit words (16 elements each). Total number of 512-bit reads from
// global memory, rounded up:
int size_in16 = (size - 1) / VECTOR_SIZE + 1;
//Per iteration of this loop perform BUFFER_SIZE vector multiplication
for (int i = 0; i < size_in16; i += BUFFER_SIZE) {
//#pragma HLS PIPELINE
#pragma HLS DATAFLOW
#pragma HLS stream variable = v1_local depth = 64
#pragma HLS stream variable = v2_local depth = 64
int chunk_size = BUFFER_SIZE;
//boundary check: the last chunk may be shorter than BUFFER_SIZE
if ((i + BUFFER_SIZE) > size_in16)
chunk_size = size_in16 - i;
//burst read both input vectors from global memory into local streams
v1_rd:
for (int j = 0; j < chunk_size; j++) {
#pragma HLS pipeline
#pragma HLS LOOP_TRIPCOUNT min = 1 max = 64
v1_local[j] = in1[i + j];
v2_local[j] = in2[i + j];
}
//multiply the buffered words lane-by-lane and burst write the results
v2_rd_add:
for (int j = 0; j < chunk_size; j++) {
#pragma HLS pipeline
#pragma HLS LOOP_TRIPCOUNT min = 1 max = 64
uint512_dt tmpV1 = v1_local[j];
uint512_dt tmpV2 = v2_local[j];
uint512_dt tmpV3 = 0;
// Fully unrolled: one 32-bit multiplier per lane of the 512-bit word.
vec_sum: for (unsigned int s = 0; s < DATAWIDTH; s+= 32){
#pragma HLS unroll
// multiply the 32-bit elements individually and compose the output
// vector; each lane product is truncated to its low 32 bits by the
// (s+31, s) range assignment
tmpV3(s + 31, s) = tmpV1(s + 31, s) * tmpV2(s + 31, s);
}
out[i + j] = tmpV3;
}
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment