Mohammad-debug/energyweights.cu

## energyweights.cu
#include<iostream>
#include<math.h>
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include "cuda_runtime.h"
#include <malloc.h>
#include "device_launch_parameters.h"

using namespace std;
#define PI 3.14

#define real double

#define size 100
//note size should be equal to nz nx ny

//cuda error check
#define cudaCheckError(code)                                                   \
    {                                                                          \
        if ((code) != cudaSuccess)                                             \
        {                                                                      \
            fprintf(stderr, "Cuda failure %s:%d: '%s' \n", __FILE__, __LINE__, \
                    cudaGetErrorString(code));                                 \
        }                                                                      \
    }


////////allocate 2d
void allot(real **&preal, const int dim1, const int dim2)
{
    // Contiguous allocation of 2D arrays

    preal = new real *[dim1];
    preal[0] = new real[dim1 * dim2];
    for (int i = 1; i < dim1; i++)
        preal[i] = preal[i - 1] + dim2;

    for (int i = 0; i < dim1; i++)
    {
        for (int j = 0; j < dim2; j++)
        {
            preal[i][j] = 0;
        }
    }
}
/////////allocate 2d


////////////////////////////3d random////////////
void random_generator(double ***&a,int x,int y,int z)
{
 for (int i = 0; i < x; i++)
    {for(int j=0;j<y;j++)
        {for(int k=0;k<z;k++)
            {
                a[i][j][k]=rand()%100;
                //a[i][j][k]=i+j+k;


            }

        }
    }
}

///////////////3d random ends///////////////////

///////2d random
void random_generator(double **&a,int x,int y)
{

    for(int j=0;j<x;j++)
        {for(int k=0;k<y;k++)
            {
                a[j][k]=rand()%100;
                 //a[j][k]=j+k;

            }

        }

}


//////2d random


//3d allocation

void allot(real ***&pointer, int dim1, int dim2, int dim3) {

  pointer = new real **[dim1];
  pointer[0] = new real*[dim1 * dim2];
  pointer[0][0] = new real[dim1 * dim2 * dim3];

  for (int i = 0; i < dim1; i++) {
    if (i > 0) {
      pointer[i] = pointer[i - 1] + dim2;
      pointer[i][0] = pointer[i - 1][0] + dim2 * dim3;
    }

    for (int j = 1; j < dim2; j++) {
      pointer[i][j] = pointer[i][j - 1] + dim3;
    }
  }


  for (int i = 0; i < dim1; i++)
    {for(int j=0;j<dim2;j++)
        {for(int k=0;k<dim3;k++)
            {
                pointer[i][j][k]=0;
                //a[i][j][k]=i+j+k;


            }

        }
    }

  //initialise to 0
}


void energy_weights2(
    // Energy Weights (forward and reverse)
    real **&We, real **&We_adj,
    // space snap parameters
    int snap_z1, int snap_z2, int snap_x1, int snap_x2){
    // Scale gradients to the Energy Weight
    // We: input as forward energy weight, and output as combined energy weight

    real max_We = 0;
    real max_w1 = 0, max_w2=0;
    real epsilon_We = 0.005;
    for (int iz=snap_z1;iz<snap_z2;iz++){
        for (int ix=snap_x1;ix<snap_x2;ix++){
            if (We[iz][ix] > max_w1){
                max_w1 = We[iz][ix];
            }
            if (We_adj[iz][ix] > max_w2){
                max_w2 = We_adj[iz][ix];
            }
            We[iz][ix] = sqrt(We[iz][ix]*We_adj[iz][ix]);

        }
    }

    // Finding maximum of the energy weight
    for (int iz=snap_z1;iz<snap_z2;iz++){
        for (int ix=snap_x1;ix<snap_x2;ix++){

            // Estimate maximum energy weight in CPU
            if (We[iz][ix] > max_We){
                max_We = We[iz][ix];
            }

        }

    }

    // Regularize energy weight to avoid division by zero
    for (int iz=snap_z1;iz<snap_z2;iz++){
        for (int ix=snap_x1;ix<snap_x2;ix++){

            We[iz][ix] += epsilon_We *  max_We;
        }

    }
    std::cout << "Max. Energy Weight = " << max_We << std::endl;
   // std::cout << "Max. Energy part = " << max_w1<<", "<< max_w2 << std::endl;
}

__global__ void cuda_energy_weights2_GPU(
    // Energy Weights (forward and reverse)
    real *We, real *We_adj,
    // space snap parameters
    int snap_z1, int snap_z2, int snap_x1, int snap_x2, int nx)
    {
    int ix = blockIdx.x * blockDim.x + threadIdx.x;
    int iz = blockIdx.y * blockDim.y + threadIdx.y;

    if( iz>=snap_z1 && iz<snap_z2 && ix>=snap_x1 && ix<snap_x2)
    {
        We[iz*nx+ix] = sqrt(We[iz*nx + ix]*We_adj[iz*nx+ix]);
    }

    }


__global__ void cuda_energy_weights2_GPU2(
    // Energy Weights (forward and reverse)
    real *We,
    // space snap parameters
    int snap_z1, int snap_z2, int snap_x1, int snap_x2, int nx, real epsilon_We, real max_We )
    {
    int ix = blockIdx.x * blockDim.x + threadIdx.x;
    int iz = blockIdx.y * blockDim.y + threadIdx.y;

    if( iz>=snap_z1 && iz<snap_z2 && ix>=snap_x1 && ix<snap_x2)
    {
       We[iz*nx+ix] += epsilon_We *  max_We;
    }

}

void energy_weights2_GPU(
    // Energy Weights (forward and reverse)
    real *&We, real *&We_adj,
    // space snap parameters
    int snap_z1, int snap_z2, int snap_x1, int snap_x2, int nx){
    // Scale gradients to the Energy Weight
    // We: input as forward energy weight, and output as combined energy weight

    real max_We = 0;
    real max_w1 = 0, max_w2=0;
    real epsilon_We = 0.005;

    int box1 = 32, box2 = 32;
    dim3 threadsPerBlock(box1, box2);
    dim3 blocksPerGrid(snap_x2 / box1 + 1, snap_z2 / box2 + 1);
        cuda_energy_weights2_GPU<<<blocksPerGrid, threadsPerBlock>>>
        ( We,We_adj, snap_z1, snap_z2, snap_x1,snap_x2,  nx);

  cudaCheckError(cudaDeviceSynchronize());
    thrust::device_ptr<real> dev_ptr1 = thrust::device_pointer_cast(We);
   // thrust::device_ptr<real> dev_ptr2 = thrust::device_pointer_cast(We_adj);


    for (int iz = snap_z1; iz < snap_z2; iz++)
    {
        real ma = thrust::reduce(dev_ptr1 +iz*nx+snap_x1 , dev_ptr1 + iz * nx + snap_x2, 0.0, thrust::maximum<real>());
         if(ma>max_We)
         max_We=ma;

    }
       cuda_energy_weights2_GPU2<<<blocksPerGrid, threadsPerBlock>>>
       ( We, snap_z1, snap_z2, snap_x1,snap_x2,  nx, epsilon_We, max_We );
    cudaCheckError(cudaDeviceSynchronize());
    // for (int iz=snap_z1;iz<snap_z2;iz++){
    //     for (int ix=snap_x1;ix<snap_x2;ix++){
    //         if (We[iz][ix] > max_w1){
    //             max_w1 = We[iz][ix];
    //         }
    //         if (We_adj[iz][ix] > max_w2){
    //             max_w2 = We_adj[iz][ix];
    //         }
    //         We[iz][ix] = sqrt(We[iz][ix]*We_adj[iz][ix]);

    //     }
    // }

    // // Finding maximum of the energy weight
    // for (int iz=snap_z1;iz<snap_z2;iz++){
    //     for (int ix=snap_x1;ix<snap_x2;ix++){

    //         // Estimate maximum energy weight in CPU
    //         if (We[iz][ix] > max_We){
    //             max_We = We[iz][ix];
    //         }

    //     }

    // }

    // // Regularize energy weight to avoid division by zero
    // for (int iz=snap_z1;iz<snap_z2;iz++){
    //     for (int ix=snap_x1;ix<snap_x2;ix++){

    //         We[iz][ix] += epsilon_We *  max_We;
    //     }

    // }
    std::cout << "Max. Energy Weight = " << max_We << std::endl;
    //std::cout << "Max. Energy part = " << max_w1<<", "<< max_w2 << std::endl;
}


int main()
{
    int nz=20,nt=10,nx=15;

      real dt=rand()%10;
     int tf=1;

     int snap_dt=1;
     int snap_z1=1;
     int snap_z2=9;
    int snap_x1=1;
    int snap_x2=4;
    int snap_dz=2;
    int snap_dx=2;

        int snap_nt = 1 + (nt-1)/snap_dt;
        int snap_nz = 1 + (snap_z2 - snap_z1)/snap_dz;
        int snap_nx = 1 + (snap_x2 - snap_x1)/snap_dx;


    real **We;allot(We,nz,nx);random_generator(We,nz,nx);
    real **We_adj;allot(We_adj,nz,nx);random_generator(We_adj,nz,nx);


    //for device
    real *d_We_adj,*d_We;
    //for devicee
    //malloc device//
    cudaCheckError(cudaMalloc(&d_We_adj, nz * nx * sizeof(real)));//device allocation
    cudaCheckError(cudaMalloc(&d_We, nz * nx * sizeof(real)));//device allocation

    //malloc device//
    //cuda memory copy//
    cudaCheckError(cudaMemcpy(d_We_adj, We_adj[0], nz * nx * sizeof(real), cudaMemcpyHostToDevice));
    cudaCheckError(cudaMemcpy(d_We, We[0], nz * nx * sizeof(real), cudaMemcpyHostToDevice));
    //cuda memory copy//

energy_weights2(
    // Energy Weights (forward and reverse)
    We, We_adj,
    // space snap parameters
   snap_z1,snap_z2,  snap_x1, snap_x2);

 energy_weights2_GPU(
    // Energy Weights (forward and reverse)
    d_We, d_We_adj,
    // space snap parameters
     snap_z1,  snap_z2,  snap_x1, snap_x2,  nx);


real **test_We;
//real ***t_accu_sxx; allot(t_accu_sxx, snap_nt, snap_nz, snap_nx);

allot(test_We,nz,nx);

cudaCheckError(cudaMemcpy(test_We[0], d_We, nz * nx * sizeof(real), cudaMemcpyDeviceToHost));

 for (int i = 0; i < nz; i++)
    {for(int j=0;j<nx;j++)


                 if (abs(test_We[i][j] - We[i][j] ) > 0.0000000001)
            {
                cout << "Failed at" << i << " " << j << " " << test_We[i][j] << " " << We[i][j] << "\n";
            }

    }


std::cout<<"done"<<std::endl;

}
	#include<iostream>
	#include<math.h>
	#include <cuda_runtime_api.h>
	#include <stdio.h>
	#include <thrust/host_vector.h>
	#include <thrust/device_vector.h>
	#include "cuda_runtime.h"
	#include <malloc.h>
	#include "device_launch_parameters.h"

	using namespace std;
	#define PI 3.14

	#define real double

	#define size 100
	//note size should be equal to nz nx ny

	//cuda error check
	#define cudaCheckError(code) \
	{ \
	if ((code) != cudaSuccess) \
	{ \
	fprintf(stderr, "Cuda failure %s:%d: '%s' \n", __FILE__, __LINE__, \
	cudaGetErrorString(code)); \
	} \
	}


	////////allocate 2d
	void allot(real **&preal, const int dim1, const int dim2)
	{
	// Contiguous allocation of 2D arrays

	preal = new real *[dim1];
	preal[0] = new real[dim1 * dim2];
	for (int i = 1; i < dim1; i++)
	preal[i] = preal[i - 1] + dim2;

	for (int i = 0; i < dim1; i++)
	{
	for (int j = 0; j < dim2; j++)
	{
	preal[i][j] = 0;
	}
	}
	}
	/////////allocate 2d


	////////////////////////////3d random////////////
	void random_generator(double ***&a,int x,int y,int z)
	{
	for (int i = 0; i < x; i++)
	{for(int j=0;j<y;j++)
	{for(int k=0;k<z;k++)
	{
	a[i][j][k]=rand()%100;
	//a[i][j][k]=i+j+k;


	}

	}
	}
	}

	///////////////3d random ends///////////////////

	///////2d random
	void random_generator(double **&a,int x,int y)
	{

	for(int j=0;j<x;j++)
	{for(int k=0;k<y;k++)
	{
	a[j][k]=rand()%100;
	//a[j][k]=j+k;

	}

	}

	}


	//////2d random





	//3d allocation

	void allot(real ***&pointer, int dim1, int dim2, int dim3) {

	pointer = new real **[dim1];
	pointer[0] = new real[dim1 dim2];
	pointer[0][0] = new real[dim1 * dim2 * dim3];

	for (int i = 0; i < dim1; i++) {
	if (i > 0) {
	pointer[i] = pointer[i - 1] + dim2;
	pointer[i][0] = pointer[i - 1][0] + dim2 * dim3;
	}

	for (int j = 1; j < dim2; j++) {
	pointer[i][j] = pointer[i][j - 1] + dim3;
	}
	}


	for (int i = 0; i < dim1; i++)
	{for(int j=0;j<dim2;j++)
	{for(int k=0;k<dim3;k++)
	{
	pointer[i][j][k]=0;
	//a[i][j][k]=i+j+k;


	}

	}
	}

	//initialise to 0
	}


	void energy_weights2(
	// Energy Weights (forward and reverse)
	real &We, real &We_adj,
	// space snap parameters
	int snap_z1, int snap_z2, int snap_x1, int snap_x2){
	// Scale gradients to the Energy Weight
	// We: input as forward energy weight, and output as combined energy weight

	real max_We = 0;
	real max_w1 = 0, max_w2=0;
	real epsilon_We = 0.005;
	for (int iz=snap_z1;iz<snap_z2;iz++){
	for (int ix=snap_x1;ix<snap_x2;ix++){
	if (We[iz][ix] > max_w1){
	max_w1 = We[iz][ix];
	}
	if (We_adj[iz][ix] > max_w2){
	max_w2 = We_adj[iz][ix];
	}
	We[iz][ix] = sqrt(We[iz][ix]*We_adj[iz][ix]);

	}
	}

	// Finding maximum of the energy weight
	for (int iz=snap_z1;iz<snap_z2;iz++){
	for (int ix=snap_x1;ix<snap_x2;ix++){

	// Estimate maximum energy weight in CPU
	if (We[iz][ix] > max_We){
	max_We = We[iz][ix];
	}

	}

	}

	// Regularize energy weight to avoid division by zero
	for (int iz=snap_z1;iz<snap_z2;iz++){
	for (int ix=snap_x1;ix<snap_x2;ix++){

	We[iz][ix] += epsilon_We * max_We;
	}

	}
	std::cout << "Max. Energy Weight = " << max_We << std::endl;
	// std::cout << "Max. Energy part = " << max_w1<<", "<< max_w2 << std::endl;
	}

	__global__ void cuda_energy_weights2_GPU(
	// Energy Weights (forward and reverse)
	real We, real We_adj,
	// space snap parameters
	int snap_z1, int snap_z2, int snap_x1, int snap_x2, int nx)
	{
	int ix = blockIdx.x * blockDim.x + threadIdx.x;
	int iz = blockIdx.y * blockDim.y + threadIdx.y;

	if( iz>=snap_z1 && iz<snap_z2 && ix>=snap_x1 && ix<snap_x2)
	{
	We[iznx+ix] = sqrt(We[iznx + ix]We_adj[iznx+ix]);
	}

	}



	__global__ void cuda_energy_weights2_GPU2(
	// Energy Weights (forward and reverse)
	real *We,
	// space snap parameters
	int snap_z1, int snap_z2, int snap_x1, int snap_x2, int nx, real epsilon_We, real max_We )
	{
	int ix = blockIdx.x * blockDim.x + threadIdx.x;
	int iz = blockIdx.y * blockDim.y + threadIdx.y;

	if( iz>=snap_z1 && iz<snap_z2 && ix>=snap_x1 && ix<snap_x2)
	{
	We[iznx+ix] += epsilon_We max_We;
	}

	}

	void energy_weights2_GPU(
	// Energy Weights (forward and reverse)
	real &We, real &We_adj,
	// space snap parameters
	int snap_z1, int snap_z2, int snap_x1, int snap_x2, int nx){
	// Scale gradients to the Energy Weight
	// We: input as forward energy weight, and output as combined energy weight

	real max_We = 0;
	real max_w1 = 0, max_w2=0;
	real epsilon_We = 0.005;

	int box1 = 32, box2 = 32;
	dim3 threadsPerBlock(box1, box2);
	dim3 blocksPerGrid(snap_x2 / box1 + 1, snap_z2 / box2 + 1);
	cuda_energy_weights2_GPU<<<blocksPerGrid, threadsPerBlock>>>
	( We,We_adj, snap_z1, snap_z2, snap_x1,snap_x2, nx);

	cudaCheckError(cudaDeviceSynchronize());
	thrust::device_ptr<real> dev_ptr1 = thrust::device_pointer_cast(We);
	// thrust::device_ptr<real> dev_ptr2 = thrust::device_pointer_cast(We_adj);


	for (int iz = snap_z1; iz < snap_z2; iz++)
	{
	real ma = thrust::reduce(dev_ptr1 +iznx+snap_x1 , dev_ptr1 + iz nx + snap_x2, 0.0, thrust::maximum<real>());
	if(ma>max_We)
	max_We=ma;

	}
	cuda_energy_weights2_GPU2<<<blocksPerGrid, threadsPerBlock>>>
	( We, snap_z1, snap_z2, snap_x1,snap_x2, nx, epsilon_We, max_We );
	cudaCheckError(cudaDeviceSynchronize());
	// for (int iz=snap_z1;iz<snap_z2;iz++){
	// for (int ix=snap_x1;ix<snap_x2;ix++){
	// if (We[iz][ix] > max_w1){
	// max_w1 = We[iz][ix];
	// }
	// if (We_adj[iz][ix] > max_w2){
	// max_w2 = We_adj[iz][ix];
	// }
	// We[iz][ix] = sqrt(We[iz][ix]*We_adj[iz][ix]);

	// }
	// }

	// // Finding maximum of the energy weight
	// for (int iz=snap_z1;iz<snap_z2;iz++){
	// for (int ix=snap_x1;ix<snap_x2;ix++){

	// // Estimate maximum energy weight in CPU
	// if (We[iz][ix] > max_We){
	// max_We = We[iz][ix];
	// }

	// }

	// }

	// // Regularize energy weight to avoid division by zero
	// for (int iz=snap_z1;iz<snap_z2;iz++){
	// for (int ix=snap_x1;ix<snap_x2;ix++){

	// We[iz][ix] += epsilon_We * max_We;
	// }

	// }
	std::cout << "Max. Energy Weight = " << max_We << std::endl;
	//std::cout << "Max. Energy part = " << max_w1<<", "<< max_w2 << std::endl;
	}


	int main()
	{
	int nz=20,nt=10,nx=15;

	real dt=rand()%10;
	int tf=1;

	int snap_dt=1;
	int snap_z1=1;
	int snap_z2=9;
	int snap_x1=1;
	int snap_x2=4;
	int snap_dz=2;
	int snap_dx=2;

	int snap_nt = 1 + (nt-1)/snap_dt;
	int snap_nz = 1 + (snap_z2 - snap_z1)/snap_dz;
	int snap_nx = 1 + (snap_x2 - snap_x1)/snap_dx;



	real **We;allot(We,nz,nx);random_generator(We,nz,nx);
	real **We_adj;allot(We_adj,nz,nx);random_generator(We_adj,nz,nx);


	//for device
	real d_We_adj,d_We;
	//for devicee
	//malloc device//
	cudaCheckError(cudaMalloc(&d_We_adj, nz * nx * sizeof(real)));//device allocation
	cudaCheckError(cudaMalloc(&d_We, nz * nx * sizeof(real)));//device allocation

	//malloc device//
	//cuda memory copy//
	cudaCheckError(cudaMemcpy(d_We_adj, We_adj[0], nz * nx * sizeof(real), cudaMemcpyHostToDevice));
	cudaCheckError(cudaMemcpy(d_We, We[0], nz * nx * sizeof(real), cudaMemcpyHostToDevice));
	//cuda memory copy//

	energy_weights2(
	// Energy Weights (forward and reverse)
	We, We_adj,
	// space snap parameters
	snap_z1,snap_z2, snap_x1, snap_x2);

	energy_weights2_GPU(
	// Energy Weights (forward and reverse)
	d_We, d_We_adj,
	// space snap parameters
	snap_z1, snap_z2, snap_x1, snap_x2, nx);






	real **test_We;
	//real ***t_accu_sxx; allot(t_accu_sxx, snap_nt, snap_nz, snap_nx);

	allot(test_We,nz,nx);

	cudaCheckError(cudaMemcpy(test_We[0], d_We, nz * nx * sizeof(real), cudaMemcpyDeviceToHost));

	for (int i = 0; i < nz; i++)
	{for(int j=0;j<nx;j++)


	if (abs(test_We[i][j] - We[i][j] ) > 0.0000000001)
	{
	cout << "Failed at" << i << " " << j << " " << test_We[i][j] << " " << We[i][j] << "\n";
	}

	}


	std::cout<<"done"<<std::endl;

	}