Created
May 18, 2020 08:44
-
-
Save nghiahsgs/68ba08a8dc2ba404bd87e1d38b411588 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
///usr/local/cuda-10.2/bin/nvcc test_quantum_parallel_16_16_16_block.cu -arch=sm_75 | |
#include <stdio.h> | |
#include <time.h> | |
#include<math.h> | |
/* Fill x[0..len-1] with pseudo-random values: each element receives a raw
 * rand() output (an int in [0, RAND_MAX]) stored as a double. */
void random_array(int len, double *x){
    int i = 0;
    while (i < len) {
        x[i] = rand();
        ++i;
    }
}
/*
 * Read nb_rows * nb_cols whitespace-separated doubles from `file` into the
 * row-major array `mat` (mat[i*nb_cols + j] holds row i, column j).
 * Stops at the first value that cannot be read; elements past that point
 * are left untouched.  Does nothing if `file` or `mat` is NULL.
 *
 * Bug fixes vs. original:
 *  - fscanf returns EOF (-1) at end of input, which is truthy, so the old
 *    `if (!fscanf(...))` test never stopped at EOF.  We now require a
 *    return value of exactly 1 (one item converted).
 *  - the old `break` only exited the inner loop, so after a failure the
 *    outer loop kept calling fscanf on the dead stream; we return instead.
 */
void load_matrix_from_file(FILE * file, int nb_rows,int nb_cols, double* mat){
    if (file == NULL || mat == NULL) return;
    for (int i = 0; i < nb_rows; i++) {
        for (int j = 0; j < nb_cols; j++) {
            int index = i * nb_cols + j;
            /* %lf converts one double; success returns 1 */
            if (fscanf(file, "%lf", &mat[index]) != 1) {
                return;  /* parse failure or EOF: stop reading entirely */
            }
        }
    }
}
/*
 * Read up to n whitespace-separated doubles from `file` into mat[0..n-1].
 * Stops at the first value that cannot be read; later elements are left
 * untouched.  Does nothing if `file` or `mat` is NULL.
 *
 * Bug fix vs. original: fscanf returns EOF (-1) at end of input, which is
 * truthy, so `if (!fscanf(...))` kept looping past EOF, re-calling fscanf
 * on an exhausted stream n times.  We now require a return of exactly 1.
 */
void load_matrix_from_file_2(FILE * file, int n, double* mat){
    if (file == NULL || mat == NULL) return;
    for (int j = 0; j < n; j++) {
        if (fscanf(file, "%lf", &mat[j]) != 1) {
            return;  /* parse failure or EOF */
        }
    }
}
/* Print every element of the row-major nb_rows x nb_cols matrix to stdout,
 * one "<flat-index> <value>" line per element, in flat-index order. */
void print_matrix(int nb_rows, int nb_cols, double * mat){
    int count = nb_rows * nb_cols;
    for (int index = 0; index < count; ++index) {
        printf("%d %lf\n", index, mat[index]);
    }
}
/* Print each element of x[0..len-1] to stdout as "element <i> : <value>". */
void in_ma_tran(int len, double *x){
    for (int idx = 0; idx < len; ++idx)
        printf("element %d : %f\n", idx, x[idx]);
}
/*
 * Sum the len elements of x and return the total.
 *
 * Bug fix vs. original: the accumulator is a double but the function was
 * declared to return int, silently truncating the fractional part (and
 * risking overflow for large sums).  The return type is now double; the
 * commented-out caller in main() already stored the result in a double,
 * and any int context still truncates exactly as before.
 */
double tong_ma_tran(int len, double *x){
    double total = 0.0;
    for (int i = 0; i < len; i++) {
        total += x[i];
    }
    return total;
}
/*
 * map_3d_array — for every (r, s, q) in [0, N)^3, atomically accumulate
 *     C[r*N+s] * B[s*N+q] * (cos(E[q]-E[r]) - cos(E[s]-E[q]))
 *         / (10 + E[s] - E[q]/2 - E[r]/2)
 * into the single device double *result_2.
 *
 * Launch layout: 3D grid of 3D blocks; (r, s, q) come from the x/y/z
 * global thread indices and are bounds-guarded, so over-launching is safe.
 *
 * NOTE(review): atomicAdd on double requires SM60+; the build line at the
 * top of this file targets sm_75, so that is satisfied.
 * NOTE(review): all N^3 threads contend on the one accumulator address, so
 * the atomics serialize heavily — a per-block reduction followed by one
 * atomic per block would be much faster; confirm before changing.
 * NOTE(review): `result` is unused here (its only write is commented out),
 * yet main() allocates N^3 doubles for it.
 * NOTE(review): the denominator groups as 10 + E[s] - E[q]/2 - E[r]/2 under
 * C precedence — presumably intended, but verify against the physics.
 */
__global__ void map_3d_array(int N, double *C, double *B, double *E, double *result, double *result_2){
int r = blockIdx.x*blockDim.x+ threadIdx.x; // global index along x
int s = blockIdx.y*blockDim.y+ threadIdx.y; // global index along y
int q = blockIdx.z*blockDim.z+ threadIdx.z; // global index along z
if(r < N && s<N && q<N){ // guard: grid may cover more than N per axis
atomicAdd(result_2, C[r*N+s]*B[s*N+q]*(cos(E[q]-E[r])-cos(E[s]-E[q]))/(10+E[s]-E[q]/2-E[r]/2));
//atomicAdd(result_2,r+s+q);
//result[r+s*N+q*N*N]=(double)(r+s+q);
}
}
/*
 * Driver: reads C (N x N), B (N x N) and E (N) from text files, launches
 * map_3d_array over a 3D grid to accumulate a triple sum into one
 * device-side double, then prints the GPU result and the elapsed time.
 *
 * Fixes vs. original:
 *  - d_result_2 is zeroed before the kernel atomicAdds into it; it was
 *    uninitialized device memory, so the printed sum was garbage.
 *  - fopen results are checked before use.
 *  - E is loaded with length N instead of a hard-coded 1000.
 *  - proper ceil-division for the grid size (the old int(N/h)+1 always
 *    launched one extra block per axis).
 *  - kernel launch errors are reported.
 *  - removed dead work: the random_array pre-fill (immediately overwritten
 *    by the file loads) and the ~8 GB device-to-host copy of d_result,
 *    which the kernel never writes (that store is commented out).
 *  - all device buffers and host buffers are freed.
 */
int main(){
    int N = 1000;
    double *h_C, *h_B, *h_E, *h_result, *h_result_2;
    double *d_C, *d_B, *d_E, *d_result, *d_result_2;

    /* ---- host allocations ---- */
    size_t mat_bytes  = (size_t)N * N * sizeof(double);
    size_t vec_bytes  = (size_t)N * sizeof(double);
    size_t cube_bytes = (size_t)N * N * N * sizeof(double); /* ~8 GB at N=1000 */
    h_C = (double*)malloc(mat_bytes);
    h_B = (double*)malloc(mat_bytes);
    h_E = (double*)malloc(vec_bytes);
    /* NOTE(review): h_result/d_result are never written by the kernel; kept
     * only so the kernel's signature and call site stay unchanged. */
    h_result = (double*)malloc(cube_bytes);
    h_result_2 = (double*)malloc(sizeof(double));
    if (h_C == NULL || h_B == NULL || h_E == NULL || h_result_2 == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }

    /* ---- device allocations ---- */
    cudaMalloc(&d_C, mat_bytes);
    cudaMalloc(&d_B, mat_bytes);
    cudaMalloc(&d_E, vec_bytes);
    cudaMalloc(&d_result, cube_bytes);
    cudaMalloc(&d_result_2, sizeof(double));

    /* ---- load inputs (row-major text files) ---- */
    FILE *file = fopen("C_1000_1000.txt", "r");
    if (file == NULL) { fprintf(stderr, "cannot open C_1000_1000.txt\n"); return 1; }
    load_matrix_from_file(file, N, N, h_C);
    fclose(file);

    FILE *file2 = fopen("B_1000_1000.txt", "r");
    if (file2 == NULL) { fprintf(stderr, "cannot open B_1000_1000.txt\n"); return 1; }
    load_matrix_from_file(file2, N, N, h_B);
    fclose(file2);

    FILE *file3 = fopen("E_1000.txt", "r");
    if (file3 == NULL) { fprintf(stderr, "cannot open E_1000.txt\n"); return 1; }
    load_matrix_from_file_2(file3, N, h_E);   /* was hard-coded 1000 */
    fclose(file3);
    printf("E[100]: %lf \n", h_E[100]);

    /* =========START======= */
    clock_t begin = clock();

    /* host -> device */
    cudaMemcpy(d_C, h_C, mat_bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, mat_bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_E, h_E, vec_bytes, cudaMemcpyHostToDevice);
    /* bug fix: zero the accumulator the kernel atomicAdds into */
    cudaMemset(d_result_2, 0, sizeof(double));

    /* launch: 8x8x8 = 512 threads per block; ceil-div grid covers N^3 */
    int nb_hyper = 8;
    int blocks_per_dim = (N + nb_hyper - 1) / nb_hyper;
    dim3 nb_block(blocks_per_dim, blocks_per_dim, blocks_per_dim);
    dim3 nb_thread_per_block(nb_hyper, nb_hyper, nb_hyper);
    map_3d_array<<<nb_block, nb_thread_per_block>>>(N, d_C, d_B, d_E, d_result, d_result_2);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    /* device -> host; a blocking cudaMemcpy also waits for the kernel */
    cudaMemcpy(h_result_2, d_result_2, sizeof(double), cudaMemcpyDeviceToHost);
    printf("Tong cac phan tu cua ma tran la (tinh tren gpu) %f\n", h_result_2[0]);

    /* ---- cleanup ---- */
    cudaFree(d_C);
    cudaFree(d_B);
    cudaFree(d_E);
    cudaFree(d_result);
    cudaFree(d_result_2);
    free(h_C);
    free(h_B);
    free(h_E);
    free(h_result);
    free(h_result_2);

    clock_t end = clock();
    double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
    printf("Total time:%f\n", time_spent);
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment