Skip to content

Instantly share code, notes, and snippets.

@Mohammad-debug
Created August 1, 2021 06:19
Show Gist options
  • Save Mohammad-debug/0786645a4ea8ed9e0af70dc3a33aceea to your computer and use it in GitHub Desktop.
Save Mohammad-debug/0786645a4ea8ed9e0af70dc3a33aceea to your computer and use it in GitHub Desktop.
#include<iostream>
#include<math.h>
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include "cuda_runtime.h"
#include <malloc.h>
#include "device_launch_parameters.h"
using namespace std;
#define PI 3.14
#define real double
#define size 100
//note size should be equal to nz nx ny
//cuda error check
#define cudaCheckError(code) \
{ \
if ((code) != cudaSuccess) \
{ \
fprintf(stderr, "Cuda failure %s:%d: '%s' \n", __FILE__, __LINE__, \
cudaGetErrorString(code)); \
} \
}
////////allocate 2d
void allot(real **&preal, const int dim1, const int dim2)
{
// Contiguous allocation of 2D arrays
preal = new real *[dim1];
preal[0] = new real[dim1 * dim2];
for (int i = 1; i < dim1; i++)
preal[i] = preal[i - 1] + dim2;
for (int i = 0; i < dim1; i++)
{
for (int j = 0; j < dim2; j++)
{
preal[i][j] = 0;
}
}
}
/////////allocate 2d
////////////////////////////3d random////////////
void random_generator(double ***&a,int x,int y,int z)
{
for (int i = 0; i < x; i++)
{for(int j=0;j<y;j++)
{for(int k=0;k<z;k++)
{
a[i][j][k]=rand()%100;
//a[i][j][k]=i+j+k;
}
}
}
}
///////////////3d random ends///////////////////
///////2d random
void random_generator(double **&a,int x,int y)
{
for(int j=0;j<x;j++)
{for(int k=0;k<y;k++)
{
a[j][k]=rand()%100;
//a[j][k]=j+k;
}
}
}
//////2d random
//3d allocation
void allot(real ***&pointer, int dim1, int dim2, int dim3) {
pointer = new real **[dim1];
pointer[0] = new real*[dim1 * dim2];
pointer[0][0] = new real[dim1 * dim2 * dim3];
for (int i = 0; i < dim1; i++) {
if (i > 0) {
pointer[i] = pointer[i - 1] + dim2;
pointer[i][0] = pointer[i - 1][0] + dim2 * dim3;
}
for (int j = 1; j < dim2; j++) {
pointer[i][j] = pointer[i][j - 1] + dim3;
}
}
for (int i = 0; i < dim1; i++)
{for(int j=0;j<dim2;j++)
{for(int k=0;k<dim3;k++)
{
pointer[i][j][k]=0;
//a[i][j][k]=i+j+k;
}
}
}
//initialise to 0
}
void energy_weights2(
// Energy Weights (forward and reverse)
real **&We, real **&We_adj,
// space snap parameters
int snap_z1, int snap_z2, int snap_x1, int snap_x2){
// Scale gradients to the Energy Weight
// We: input as forward energy weight, and output as combined energy weight
real max_We = 0;
real max_w1 = 0, max_w2=0;
real epsilon_We = 0.005;
for (int iz=snap_z1;iz<snap_z2;iz++){
for (int ix=snap_x1;ix<snap_x2;ix++){
if (We[iz][ix] > max_w1){
max_w1 = We[iz][ix];
}
if (We_adj[iz][ix] > max_w2){
max_w2 = We_adj[iz][ix];
}
We[iz][ix] = sqrt(We[iz][ix]*We_adj[iz][ix]);
}
}
// Finding maximum of the energy weight
for (int iz=snap_z1;iz<snap_z2;iz++){
for (int ix=snap_x1;ix<snap_x2;ix++){
// Estimate maximum energy weight in CPU
if (We[iz][ix] > max_We){
max_We = We[iz][ix];
}
}
}
// Regularize energy weight to avoid division by zero
for (int iz=snap_z1;iz<snap_z2;iz++){
for (int ix=snap_x1;ix<snap_x2;ix++){
We[iz][ix] += epsilon_We * max_We;
}
}
std::cout << "Max. Energy Weight = " << max_We << std::endl;
// std::cout << "Max. Energy part = " << max_w1<<", "<< max_w2 << std::endl;
}
__global__ void cuda_energy_weights2_GPU(
// Energy Weights (forward and reverse)
real *We, real *We_adj,
// space snap parameters
int snap_z1, int snap_z2, int snap_x1, int snap_x2, int nx)
{
int ix = blockIdx.x * blockDim.x + threadIdx.x;
int iz = blockIdx.y * blockDim.y + threadIdx.y;
if( iz>=snap_z1 && iz<snap_z2 && ix>=snap_x1 && ix<snap_x2)
{
We[iz*nx+ix] = sqrt(We[iz*nx + ix]*We_adj[iz*nx+ix]);
}
}
__global__ void cuda_energy_weights2_GPU2(
// Energy Weights (forward and reverse)
real *We,
// space snap parameters
int snap_z1, int snap_z2, int snap_x1, int snap_x2, int nx, real epsilon_We, real max_We )
{
int ix = blockIdx.x * blockDim.x + threadIdx.x;
int iz = blockIdx.y * blockDim.y + threadIdx.y;
if( iz>=snap_z1 && iz<snap_z2 && ix>=snap_x1 && ix<snap_x2)
{
We[iz*nx+ix] += epsilon_We * max_We;
}
}
void energy_weights2_GPU(
// Energy Weights (forward and reverse)
real *&We, real *&We_adj,
// space snap parameters
int snap_z1, int snap_z2, int snap_x1, int snap_x2, int nx){
// Scale gradients to the Energy Weight
// We: input as forward energy weight, and output as combined energy weight
real max_We = 0;
real max_w1 = 0, max_w2=0;
real epsilon_We = 0.005;
int box1 = 32, box2 = 32;
dim3 threadsPerBlock(box1, box2);
dim3 blocksPerGrid(snap_x2 / box1 + 1, snap_z2 / box2 + 1);
cuda_energy_weights2_GPU<<<blocksPerGrid, threadsPerBlock>>>
( We,We_adj, snap_z1, snap_z2, snap_x1,snap_x2, nx);
cudaCheckError(cudaDeviceSynchronize());
thrust::device_ptr<real> dev_ptr1 = thrust::device_pointer_cast(We);
// thrust::device_ptr<real> dev_ptr2 = thrust::device_pointer_cast(We_adj);
for (int iz = snap_z1; iz < snap_z2; iz++)
{
real ma = thrust::reduce(dev_ptr1 +iz*nx+snap_x1 , dev_ptr1 + iz * nx + snap_x2, 0.0, thrust::maximum<real>());
if(ma>max_We)
max_We=ma;
}
cuda_energy_weights2_GPU2<<<blocksPerGrid, threadsPerBlock>>>
( We, snap_z1, snap_z2, snap_x1,snap_x2, nx, epsilon_We, max_We );
cudaCheckError(cudaDeviceSynchronize());
// for (int iz=snap_z1;iz<snap_z2;iz++){
// for (int ix=snap_x1;ix<snap_x2;ix++){
// if (We[iz][ix] > max_w1){
// max_w1 = We[iz][ix];
// }
// if (We_adj[iz][ix] > max_w2){
// max_w2 = We_adj[iz][ix];
// }
// We[iz][ix] = sqrt(We[iz][ix]*We_adj[iz][ix]);
// }
// }
// // Finding maximum of the energy weight
// for (int iz=snap_z1;iz<snap_z2;iz++){
// for (int ix=snap_x1;ix<snap_x2;ix++){
// // Estimate maximum energy weight in CPU
// if (We[iz][ix] > max_We){
// max_We = We[iz][ix];
// }
// }
// }
// // Regularize energy weight to avoid division by zero
// for (int iz=snap_z1;iz<snap_z2;iz++){
// for (int ix=snap_x1;ix<snap_x2;ix++){
// We[iz][ix] += epsilon_We * max_We;
// }
// }
std::cout << "Max. Energy Weight = " << max_We << std::endl;
//std::cout << "Max. Energy part = " << max_w1<<", "<< max_w2 << std::endl;
}
int main()
{
int nz=20,nt=10,nx=15;
real dt=rand()%10;
int tf=1;
int snap_dt=1;
int snap_z1=1;
int snap_z2=9;
int snap_x1=1;
int snap_x2=4;
int snap_dz=2;
int snap_dx=2;
int snap_nt = 1 + (nt-1)/snap_dt;
int snap_nz = 1 + (snap_z2 - snap_z1)/snap_dz;
int snap_nx = 1 + (snap_x2 - snap_x1)/snap_dx;
real **We;allot(We,nz,nx);random_generator(We,nz,nx);
real **We_adj;allot(We_adj,nz,nx);random_generator(We_adj,nz,nx);
//for device
real *d_We_adj,*d_We;
//for devicee
//malloc device//
cudaCheckError(cudaMalloc(&d_We_adj, nz * nx * sizeof(real)));//device allocation
cudaCheckError(cudaMalloc(&d_We, nz * nx * sizeof(real)));//device allocation
//malloc device//
//cuda memory copy//
cudaCheckError(cudaMemcpy(d_We_adj, We_adj[0], nz * nx * sizeof(real), cudaMemcpyHostToDevice));
cudaCheckError(cudaMemcpy(d_We, We[0], nz * nx * sizeof(real), cudaMemcpyHostToDevice));
//cuda memory copy//
energy_weights2(
// Energy Weights (forward and reverse)
We, We_adj,
// space snap parameters
snap_z1,snap_z2, snap_x1, snap_x2);
energy_weights2_GPU(
// Energy Weights (forward and reverse)
d_We, d_We_adj,
// space snap parameters
snap_z1, snap_z2, snap_x1, snap_x2, nx);
real **test_We;
//real ***t_accu_sxx; allot(t_accu_sxx, snap_nt, snap_nz, snap_nx);
allot(test_We,nz,nx);
cudaCheckError(cudaMemcpy(test_We[0], d_We, nz * nx * sizeof(real), cudaMemcpyDeviceToHost));
for (int i = 0; i < nz; i++)
{for(int j=0;j<nx;j++)
if (abs(test_We[i][j] - We[i][j] ) > 0.0000000001)
{
cout << "Failed at" << i << " " << j << " " << test_We[i][j] << " " << We[i][j] << "\n";
}
}
std::cout<<"done"<<std::endl;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment