Skip to content

Instantly share code, notes, and snippets.

@iambrj
Created November 6, 2021 16:18
Show Gist options
  • Save iambrj/1500993ba4e031e8b50b52c040f9a702 to your computer and use it in GitHub Desktop.
/**********
Copyright (c) 2019-2020, Xilinx, Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********/
#include "event_timer.hpp"
#include <iostream>
#include <memory>
#include <string>
// Xilinx OpenCL and XRT includes
#include "xilinx_ocl.hpp"
// Software reference implementation of element-wise vector multiply:
// c[i] = a[i] * b[i] for every i in [0, size). Used on the host to
// produce the golden result the FPGA kernel output is checked against.
void vmul_sw(float *a, float *b, float *c, uint32_t size)
{
    uint32_t idx = 0;
    while (idx < size) {
        c[idx] = a[idx] * b[idx];
        ++idx;
    }
}
int main(int argc, char *argv[])
{
// Initialize an event timer we'll use for monitoring the application
EventTimer et;
// Check if the binary file is passed as argument
if (argc != 2) {
std::cout << "Usage: " << argv[0] << " <XCLBIN File>" << std::endl;
return EXIT_FAILURE;
}
// Copy binary name
char* binaryName = argv[1];
// Get target and set BUFSIZE 1024 times bigger for hw runs
std::string target = getenv("XCL_EMULATION_MODE");
uint32_t BUFSIZE = (target.compare("hw") == 0) ? (1024 * 1024 * 32) : (1024 * 32);
if(argc == 3) {
BUFSIZE = std::stoi(argv[2]);
}
std::cout << "-- Parallelizing the Data Path --" << std::endl << std::endl;
// Initialize the runtime (including a command queue) and load the
// FPGA image
std::cout << "Loading " << binaryName << " to program the board" << std::endl << std::endl;
et.add("OpenCL Initialization");
// This application will use the first Xilinx device found in the system
swm::XilinxOcl xocl;
xocl.initialize(binaryName);
cl::CommandQueue q = xocl.get_command_queue();
cl::Kernel krnl = xocl.get_kernel("wide_vmul");
et.finish();
/// New code for example 01
std::cout << "Running kernel test XRT-allocated buffers and wide data path:" << std::endl
<< std::endl;
// Map our user-allocated buffers as OpenCL buffers using a shared
// host pointer
et.add("Allocate contiguous OpenCL buffers");
cl_mem_ext_ptr_t bank_ext;
bank_ext.flags = 0 | XCL_MEM_TOPOLOGY;
bank_ext.obj = NULL;
bank_ext.param = 0;
cl::Buffer a_buf(xocl.get_context(),
static_cast<cl_mem_flags>(CL_MEM_READ_ONLY),
BUFSIZE * sizeof(float),
NULL,
NULL);
cl::Buffer b_buf(xocl.get_context(),
static_cast<cl_mem_flags>(CL_MEM_READ_ONLY),
BUFSIZE * sizeof(float),
NULL,
NULL);
cl::Buffer c_buf(xocl.get_context(),
static_cast<cl_mem_flags>(CL_MEM_READ_WRITE),
BUFSIZE * sizeof(float),
NULL,
NULL);
cl::Buffer d_buf(xocl.get_context(),
static_cast<cl_mem_flags>(CL_MEM_READ_WRITE |
CL_MEM_ALLOC_HOST_PTR |
CL_MEM_EXT_PTR_XILINX),
BUFSIZE * sizeof(float),
&bank_ext,
NULL);
et.finish();
// Set vmul kernel arguments. We do this before mapping the buffers to allow XRT
// to allocate the buffers in the appropriate memory banks for the selected
// kernels. For buffer 'd' we explicitly set a bank above, but this buffer is
// never migrated to the Alveo card so this mapping is theoretical.
et.add("Set kernel arguments");
krnl.setArg(0, a_buf);
krnl.setArg(1, b_buf);
krnl.setArg(2, c_buf);
krnl.setArg(3, BUFSIZE);
et.add("Map buffers to user space pointers");
float *a = (float *)q.enqueueMapBuffer(a_buf,
CL_TRUE,
CL_MAP_WRITE,
0,
BUFSIZE * sizeof(float));
float *b = (float *)q.enqueueMapBuffer(b_buf,
CL_TRUE,
CL_MAP_WRITE,
0,
BUFSIZE * sizeof(float));
float *d = (float *)q.enqueueMapBuffer(d_buf,
CL_TRUE,
CL_MAP_WRITE | CL_MAP_READ,
0,
BUFSIZE * sizeof(float));
et.finish();
et.add("Populating buffer inputs");
for (uint32_t i = 0; i < BUFSIZE; i++) {
a[i] = i;
b[i] = 2 * i;
}
et.finish();
// For comparison, let's have the CPU calculate the result
et.add("Software VADD run");
vmul(a, b, d, BUFSIZE);
et.finish();
// Send the buffers down to the Alveo card
et.add("Memory object migration enqueue");
cl::Event event_sp;
q.enqueueMigrateMemObjects({a_buf, b_buf}, 0, NULL, &event_sp);
clWaitForEvents(1, (const cl_event *)&event_sp);
et.add("OCL Enqueue task");
q.enqueueTask(krnl, NULL, &event_sp);
et.add("Wait for kernel to complete");
clWaitForEvents(1, (const cl_event *)&event_sp);
// Migrate memory back from device
et.add("Read back computation results");
float *c = (float *)q.enqueueMapBuffer(c_buf,
CL_TRUE,
CL_MAP_READ,
0,
BUFSIZE * sizeof(float));
et.finish();
// Verify the results
bool verified = true;
for (uint32_t i = 0; i < BUFSIZE; i++) {
if (c[i] != d[i]) {
verified = false;
std::cout << "ERROR: software and hardware vmul do not match: "
<< c[i] << "!=" << d[i] << " at position " << i << std::endl;
break;
}
}
if (verified) {
std::cout
<< std::endl
<< "OCL-mapped contiguous buffer example complete successfully!"
<< std::endl
<< std::endl;
}
else {
std::cout
<< std::endl
<< "OCL-mapped contiguous buffer example complete! (with errors)"
<< std::endl
<< std::endl;
}
std::cout << "--------------- Key execution times ---------------" << std::endl;
q.enqueueUnmapMemObject(a_buf, a);
q.enqueueUnmapMemObject(b_buf, b);
q.enqueueUnmapMemObject(c_buf, c);
q.enqueueUnmapMemObject(d_buf, d);
q.finish();
et.print();
}
/**********
Copyright (c) 2018-2020, Xilinx, Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********/
/*******************************************************************************
Description:
Wide Memory Access Example using ap_uint<Width> datatype
Description: This is a vector multiplication example to demonstrate wide
memory access with a 512-bit data width using the ap_uint<> datatype, which
is defined inside the 'ap_int.h' file.
*******************************************************************************/
//Including to use ap_uint<> datatype
#include <ap_int.h>
#include <stdio.h>
#include <string.h>
// Number of 512-bit words buffered on-chip per burst iteration.
#define BUFFER_SIZE 64
// Width of the kernel's memory-side datapath, in bits.
#define DATAWIDTH 512
#define VECTOR_SIZE (DATAWIDTH / 32) // vector size is 16 (512/32 = 16)
// One 512-bit wide word; each holds VECTOR_SIZE packed 32-bit lanes.
typedef ap_uint<DATAWIDTH> uint512_dt;
//TRIPCOUNT identifier
// NOTE(review): these constants appear intended for the LOOP_TRIPCOUNT
// pragmas, but the pragmas in the kernel below use literal bounds instead —
// confirm whether these are still needed.
const unsigned int c_chunk_sz = BUFFER_SIZE;
const unsigned int c_size = VECTOR_SIZE;
/*
Vector Multiplication Kernel Implementation using uint512_dt datatype
Arguments:
in1 (input) --> Input Vector1
in2 (input) --> Input Vector2
out (output) --> Output Vector
size (input) --> Size of Vector in Integer
*/
extern "C"
{
// Wide-datapath element-wise vector multiply kernel. Reads both inputs in
// 512-bit words, multiplies the sixteen 32-bit lanes of each word pairwise,
// and writes the packed 512-bit results to 'out'.
// NOTE(review): the lanes are multiplied as 32-bit integer bit-fields. The
// host in this gist fills the buffers with floats; integer multiplication of
// float bit patterns is not float multiplication — confirm the intended
// element type against the host code.
void wide_vmul(
const uint512_dt *in1, // Read-Only Vector 1
const uint512_dt *in2, // Read-Only Vector 2
uint512_dt *out, // Output Result
int size // Size in integer
)
{
#pragma HLS INTERFACE m_axi port = in1 max_read_burst_length = 32 offset = slave bundle = gmem
#pragma HLS INTERFACE m_axi port = in2 max_read_burst_length = 32 offset = slave bundle = gmem1
#pragma HLS INTERFACE m_axi port = out max_write_burst_length = 32 offset = slave bundle = gmem2
#pragma HLS INTERFACE s_axilite port = in1 bundle = control
#pragma HLS INTERFACE s_axilite port = in2 bundle = control
#pragma HLS INTERFACE s_axilite port = out bundle = control
#pragma HLS INTERFACE s_axilite port = size bundle = control
#pragma HLS INTERFACE s_axilite port = return bundle = control
uint512_dt v1_local[BUFFER_SIZE]; // Local memory to store vector1
uint512_dt v2_local[BUFFER_SIZE]; // Local memory to store vector2
// Input vector size is given in 32-bit elements, but the kernel accesses
// 512-bit words (16 elements each). Total number of 512-bit reads from
// global memory, rounded up:
int size_in16 = (size - 1) / VECTOR_SIZE + 1;
//Per iteration of this loop perform BUFFER_SIZE vector multiplication
for (int i = 0; i < size_in16; i += BUFFER_SIZE) {
//#pragma HLS PIPELINE
#pragma HLS DATAFLOW
#pragma HLS stream variable = v1_local depth = 64
#pragma HLS stream variable = v2_local depth = 64
int chunk_size = BUFFER_SIZE;
//boundary check: the last chunk may be shorter than BUFFER_SIZE
if ((i + BUFFER_SIZE) > size_in16)
chunk_size = size_in16 - i;
//burst read both input vectors from global memory into local streams
v1_rd:
for (int j = 0; j < chunk_size; j++) {
#pragma HLS pipeline
#pragma HLS LOOP_TRIPCOUNT min = 1 max = 64
v1_local[j] = in1[i + j];
v2_local[j] = in2[i + j];
}
//multiply the buffered words lane-by-lane and burst write the results
v2_rd_add:
for (int j = 0; j < chunk_size; j++) {
#pragma HLS pipeline
#pragma HLS LOOP_TRIPCOUNT min = 1 max = 64
uint512_dt tmpV1 = v1_local[j];
uint512_dt tmpV2 = v2_local[j];
uint512_dt tmpV3 = 0;
// Fully unrolled: one 32-bit multiplier per lane of the 512-bit word.
vec_sum: for (unsigned int s = 0; s < DATAWIDTH; s+= 32){
#pragma HLS unroll
// multiply the 32-bit elements individually and compose the output
// vector; each lane product is truncated to its low 32 bits by the
// (s+31, s) range assignment
tmpV3(s + 31, s) = tmpV1(s + 31, s) * tmpV2(s + 31, s);
}
out[i + j] = tmpV3;
}
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment