Dario Salvati DWarez

## hello_world.cu
#include <stdio.h>

// Abort-on-failure check for a CUDA runtime status code.
//
// err: an expression of type cudaError_t (a stored status or a runtime API
//      call). It is evaluated exactly once.
//
// If the status is not cudaSuccess, prints the runtime's error string and the
// line number of the macro invocation to stdout, then terminates the process
// with EXIT_FAILURE. Otherwise it does nothing.
//
// The body is wrapped in do { ... } while (0) so the macro expands to a single
// statement: it is safe inside an unbraced if/else and must be followed by a
// semicolon, like a function call.
#define CUDA_CHECK_ERROR(err) \
    do { \
        cudaError_t cudaCheckStatus_ = (err); \
        if (cudaCheckStatus_ != cudaSuccess) { \
            printf("CUDA err: %s at line %d\n", cudaGetErrorString(cudaCheckStatus_), __LINE__); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)

// Kernel definition

## bc_tiled.cu
#include <stdio.h>

// Edge length, in elements, of the square shared-memory tiles (Ads, Bds)
// declared by matMulKernel.
// NOTE(review): if blocks are launched as TILE_WIDTH x TILE_WIDTH threads,
// 7 x 7 = 49 threads is not a multiple of the 32-thread warp size -- confirm
// this value is intentional.
#define TILE_WIDTH 7
__global__ void matMulKernel(float* A, float* B, float* C, int Width) {
    __shared__ float Ads[TILE_WIDTH][TILE_WIDTH];
    __shared__ float Bds[TILE_WIDTH][TILE_WIDTH];

    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
	#include <stdio.h>

	// Abort-on-failure check for a CUDA runtime status code.
	//
	// err: an expression of type cudaError_t (a stored status or a runtime API
	//      call). It is evaluated exactly once.
	//
	// If the status is not cudaSuccess, prints the runtime's error string and the
	// line number of the macro invocation to stdout, then terminates the process
	// with EXIT_FAILURE. Otherwise it does nothing.
	//
	// The body is wrapped in do { ... } while (0) so the macro expands to a single
	// statement: it is safe inside an unbraced if/else and must be followed by a
	// semicolon, like a function call.
	#define CUDA_CHECK_ERROR(err) \
	do { \
		cudaError_t cudaCheckStatus_ = (err); \
		if (cudaCheckStatus_ != cudaSuccess) { \
			printf("CUDA err: %s at line %d\n", cudaGetErrorString(cudaCheckStatus_), __LINE__); \
			exit(EXIT_FAILURE); \
		} \
	} while (0)

	// Kernel definition
	#include <stdio.h>

	// Edge length, in elements, of the square shared-memory tiles (Ads, Bds)
	// declared by matMulKernel.
	// NOTE(review): if blocks are launched as TILE_WIDTH x TILE_WIDTH threads,
	// 7 x 7 = 49 threads is not a multiple of the 32-thread warp size -- confirm
	// this value is intentional.
	#define TILE_WIDTH 7
	__global__ void matMulKernel(float* A, float* B, float* C, int Width) {
	__shared__ float Ads[TILE_WIDTH][TILE_WIDTH];
	__shared__ float Bds[TILE_WIDTH][TILE_WIDTH];

	int bx = blockIdx.x;
	int by = blockIdx.y;
	int tx = threadIdx.x;