Skip to content

Instantly share code, notes, and snippets.

@mickeyouyou
Created March 10, 2020 10:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mickeyouyou/26807675e6c413bc957d5517eec3db1c to your computer and use it in GitHub Desktop.
Save mickeyouyou/26807675e6c413bc957d5517eec3db1c to your computer and use it in GitHub Desktop.
Naive CUDA square-matrix multiplication example (one thread per output row).
#include <stdio.h>
#include <stdlib.h>

#include <cuda.h>
// Launch configuration: BLOCK_NUM blocks of THREAD_NUM threads, one thread
// per matrix row.
#define THREAD_NUM 256
#define BLOCK_NUM 32
// Parenthesize expression macros so they expand safely inside any larger
// expression (e.g. `x / R_SIZE` or `M_SIZE - 1` would mis-expand otherwise).
#define R_SIZE (THREAD_NUM * BLOCK_NUM)  // matrix dimension (rows == cols)
#define M_SIZE (R_SIZE * R_SIZE)         // total element count per matrix
// Naive square matrix multiply: result = mat1 x mat2, all three R_SIZE x R_SIZE,
// stored row-major in flat device arrays.
// Expected launch: <<<BLOCK_NUM, THREAD_NUM>>> (1-D grid, 1-D blocks), so that
// BLOCK_NUM * THREAD_NUM == R_SIZE and each thread computes one full output row.
// No shared memory; no bounds guard needed because the grid exactly covers R_SIZE rows.
__global__ void mat_mul(int* mat1, int* mat2, int* result) {
    int bid = blockIdx.x;
    int tid = threadIdx.x;
    // This thread's output row (original had typo THRREAD_NUM).
    int index = bid * THREAD_NUM + tid;
    // Original outer loop declared `c` but tested/incremented undeclared `i`.
    for (int c = 0; c < R_SIZE; c++) {
        // Accumulate in a register: the output buffer comes from cudaMalloc
        // and is uninitialized, so `+=` into it would produce garbage.
        int sum = 0;
        for (int n = 0; n < R_SIZE; n++) {
            sum += mat1[index * R_SIZE + n] * mat2[n * R_SIZE + c];
        }
        result[index * R_SIZE + c] = sum;
    }
}
// Host driver: allocate host and device matrices, copy inputs to the GPU,
// run the multiply kernel, and copy the product back.
// Returns 0 on success, 1 on allocation or launch failure.
int main() {
    int* mat1;
    int* mat2;
    int* result;
    int* g_mat1;
    int* g_mat2;
    int* g_mat_result;
    const size_t bytes = (size_t)M_SIZE * sizeof(int);

    // Host buffers. calloc zero-fills so the demo computes a deterministic
    // (all-zero) product instead of multiplying uninitialized memory.
    mat1 = (int*)calloc(M_SIZE, sizeof(int));
    mat2 = (int*)calloc(M_SIZE, sizeof(int));
    result = (int*)malloc(bytes);
    if (mat1 == NULL || mat2 == NULL || result == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }

    // Device buffers — the original only allocated g_mat1 ("same to..."),
    // leaving g_mat2/g_mat_result as wild pointers passed to the kernel.
    cudaMalloc((void**)&g_mat1, bytes);
    cudaMalloc((void**)&g_mat2, bytes);
    cudaMalloc((void**)&g_mat_result, bytes);

    // Copy inputs host -> device (enum is cudaMemcpyHostToDevice, CamelCase).
    cudaMemcpy(g_mat1, mat1, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(g_mat2, mat2, bytes, cudaMemcpyHostToDevice);

    // Kernel launch needs triple angle brackets; grid covers exactly R_SIZE rows.
    mat_mul<<<BLOCK_NUM, THREAD_NUM>>>(g_mat1, g_mat2, g_mat_result);
    cudaError_t err = cudaGetLastError();  // launch-config errors surface here
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // Blocking device->host copy also synchronizes with the kernel, so
    // `result` is complete once this returns.
    cudaMemcpy(result, g_mat_result, bytes, cudaMemcpyDeviceToHost);

    // Release device and host resources (original leaked everything).
    cudaFree(g_mat1);
    cudaFree(g_mat2);
    cudaFree(g_mat_result);
    free(mat1);
    free(mat2);
    free(result);
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment