ivan-pi/Makefile

## example.cpp
// example.cpp
//
// Compile with:
//    icpx -fsycl -O3 -march=native example.cpp
//
#include <vector>
#include <algorithm>
#include <cassert>

#include <cstdio>
#include <string>
#include <iostream>
#include <chrono>

#include <CL/sycl.hpp>

const int N = 512;

const int NX = N;
const int NY = N;

const int num_steps = 50000;
const int num_steps_io = 10000;

const double U0 = 0.1 / std::sqrt(3);
const double nu = U0 * (double) N / 3000;

const double tau = 3*nu + 0.5;
const double omega = 1./tau;

const double w0 = 4./9.;
const double ws = 1./9.;
const double wd = 1./36.;

const std::array<double,9> w{w0,ws,ws,ws,ws,wd,wd,wd,wd};

const double PI = 4*std::atan(1.0);

struct shearlayer {
    double length;
    double kappa;
    double delta;
    double umax;

    std::array<double,2> operator()(std::array<double,2> xy) {

        const double x = xy[0]/length;
        const double y = xy[1]/length;

        const double ux = y < 0.5 ?
            umax*std::tanh(kappa*(y - 0.25)) : umax*std::tanh(kappa*(0.75 - y));
        const double uy = umax*delta*std::sin(2.0*PI*(x + 0.25));

        return {ux,uy};
    }
};

struct shearlayer layer{N,80.,0.05,U0};

template<typename T>
void writeVTK_rectilinear(
        const char* filepath,
        int nx, int ny,
        const T rho[],
        const T ux[], const T uy[]) {

    std::ofstream vtkfile(filepath);

    vtkfile << "# vtk DataFile Version 3.0\n";
    vtkfile << "lbm grid output\n";
    vtkfile << "ASCII\n";

#if 0
    vtkfile << "DATASET RECTILINEAR_GRID\n";

    vtkfile << "DIMENSIONS " << nx+1 << ' ' << ny+1 << " 2\n";

    vtkfile << "X_COORDINATES " << nx+1 << " float\n";
    for (int ix = 0; ix <= nx; ++ix) {
        vtkfile << (T) ix << '\n';
    }

    vtkfile << "Y_COORDINATES " << ny+1 << " float\n";
    for (int iy = 0; iy <= ny; ++iy) {
        vtkfile << (T) iy << '\n';
    }

    vtkfile << "Z_COORDINATES 2 float\n";
    vtkfile << (T) 0.0 << '\n';
    vtkfile << (T) 1.0 << '\n';
#else
    vtkfile << "DATASET STRUCTURED_POINTS\n";
    vtkfile << "DIMENSIONS " << nx << ' ' << ny << " 1\n";
    vtkfile << "ORIGIN 0.5 0.5 0.5\n";
    vtkfile << "SPACING 1.0 1.0 0.0\n";
#endif
    vtkfile << '\n';
    vtkfile << "POINT_DATA " << nx*ny << '\n';
    vtkfile << "SCALARS Density float 1\n";
    vtkfile << "LOOKUP_TABLE default\n";
    for (int iy = 0; iy < ny; ++iy) {
        for (int ix = 0; ix < nx; ++ix) {
            vtkfile << rho[ix*ny + iy] << '\n';
        }
    }
    vtkfile << "VECTORS Velocity float\n";
    for (int iy = 0; iy < ny; ++iy) {
        for (int ix = 0; ix < nx; ++ix) {
            vtkfile << ux[ix*ny+iy] << ' ' << uy[ix*nx+iy] << " 0\n";
        }
    }
    vtkfile.close();
}

void set_equilibrium(sycl::queue &queue,
              sycl::buffer<double,1> brho,
              sycl::buffer<double,2> bvel,
              sycl::buffer<double,2> bf) {

    queue.submit([&](sycl::handler& h){

        sycl::accessor f(bf,h,sycl::write_only);
        sycl::accessor rho(brho,h,sycl::read_only);
        sycl::accessor vel(bvel,h,sycl::read_only);

        h.parallel_for(sycl::range{NX,NY},[=](sycl::id<2> id){

            const std::size_t i = id[0]*NY + id[1];

            const double ux = vel[0][i];
            const double uy = vel[1][i];
            const double usqr = ux*ux + uy*uy;

            double cu[9];
            cu[0] =   0;
            cu[1] =  ux;
            cu[2] =       uy;
            cu[3] = -ux;
            cu[4] =      -uy;
            cu[5] =  ux + uy;
            cu[6] = -ux + uy;
            cu[7] = -ux - uy;
            cu[8] =  ux - uy;

            for (int q = 0; q < 9; ++q) {
                f[q][i] = rho[i]*w[q]*(1 + 3*cu[q] + 4.5*cu[q]*cu[q] - 1.5*usqr);
            }
        });
    });

    queue.wait();
}

void macros(sycl::queue &Q,
        sycl::buffer<double,2> b_f,
        sycl::buffer<double,1> b_rho,
        sycl::buffer<double,2> b_vel) {

    Q.submit([&](sycl::handler& h) {

        sycl::accessor f(b_f,h,sycl::read_only);
        sycl::accessor rho(b_rho,h,sycl::write_only);
        sycl::accessor vel(b_vel,h,sycl::write_only);

        h.parallel_for<class macro>(NX*NY,[=](sycl::id<1> i) {

            const double rho_ =
                     ((f[5][i] + f[7][i]) + (f[6][i] + f[8][i])) +
                     ((f[1][i] + f[3][i]) + (f[2][i] + f[4][i])) + f[0][i];

            rho[i] = rho_;

            const double invrho = 1.0/rho_;

            vel[0][i] = invrho * (((f[5][i] - f[7][i]) + (f[8][i] - f[6][i])) + (f[1][i] - f[3][i]));
            vel[1][i] = invrho * (((f[5][i] - f[7][i]) + (f[6][i] - f[8][i])) + (f[2][i] - f[4][i]));

        });

    });

}

void collide(sycl::queue &Q,

        sycl::buffer<double,2> b_f,
        sycl::buffer<double,1> b_rho,
        sycl::buffer<double,2> b_vel,
        double omega) {

    Q.submit([&](sycl::handler &h) {

        //sycl::accessor rho(b_rho,h,sycl::read_only);
        //sycl::accessor vel(b_vel,h,sycl::read_only);

        sycl::accessor f(b_f,h,sycl::read_write);

        const double w[3] = {w0, ws, wd};
        const double omegabar = 1.0 - omega;
        const double one_third = 1.0 / 3.0;
        const double one_half = 1.0 / 2.0;

        const double omega_w0 = 3.0 * omega * w[0];
        const double omega_ws = 3.0 * omega * w[1];
        const double omega_wd = 3.0 * omega * w[2];

        h.parallel_for<class collision>(NX*NY,[=](sycl::id<1> i) {

            // --- constants ---


            // --- common terms and direction 0 ---

            const double rho =
                     ((f[5][i] + f[7][i]) + (f[6][i] + f[8][i])) +
                     ((f[1][i] + f[3][i]) + (f[2][i] + f[4][i])) + f[0][i];

            const double invrho = 1.0/rho;

            const double velx = invrho * (((f[5][i] - f[7][i]) + (f[8][i] - f[6][i])) + (f[1][i] - f[3][i]));
            const double vely = invrho * (((f[5][i] - f[7][i]) + (f[6][i] - f[8][i])) + (f[2][i] - f[4][i]));

            const double velxx = velx*velx;
            const double velyy = vely*vely;

            const double indp = one_third - one_half * (velxx + velyy);

            f[0][i] = omegabar * f[0][i]  +  omega_w0 * rho * indp;

            // ------

            const double vel_trm_13 = indp + (3./2.)*velxx;

            f[1][i] = omegabar * f[1][i]  +  omega_ws*rho*(vel_trm_13 + velx);
            f[3][i] = omegabar * f[3][i]  +  omega_ws*rho*(vel_trm_13 - velx);

            // ------

            const double vel_trm_24 = indp + (3./2.)*velyy;

            f[2][i] = omegabar * f[2][i]  +  omega_ws * rho * (vel_trm_24 + vely);
            f[4][i] = omegabar * f[4][i]  +  omega_ws * rho * (vel_trm_24 - vely);

            // ------

            const double velxpy = velx + vely;
            const double vel_trm_57 = indp + (3./2.) * velxpy * velxpy;

            f[5][i] = omegabar * f[5][i] + omega_wd * rho * (vel_trm_57 + velxpy);
            f[7][i] = omegabar * f[7][i] + omega_wd * rho * (vel_trm_57 - velxpy);

            // ------

            const double velxmy = velx - vely;
            const double vel_trm_68 = indp + (3./2.) * velxmy * velxmy;

            f[6][i] = omegabar * f[6][i] + omega_wd * rho * (vel_trm_68 - velxmy);
            f[8][i] = omegabar * f[8][i] + omega_wd * rho * (vel_trm_68 + velxmy);

        });

    });

}

void stream(sycl::queue &Q,
        sycl::buffer<double,2> bfold,
        sycl::buffer<double,2> bfnew) {

    Q.submit([&](sycl::handler& h) {

        sycl::accessor fold(bfold,h,sycl::read_only);
        sycl::accessor fnew(bfnew,h,sycl::write_only);

        h.parallel_for(sycl::range{NX,NY},[=](sycl::id<2> id) {

            auto ix = id[0];
            auto iy = id[1];

            size_t xp1 = (ix + 1) % NX;
            size_t xm1 = (ix + NX - 1) % NX;

            size_t yp1 = (iy + 1) % NY;
            size_t ym1 = (iy + NY - 1) % NY;

            size_t curr = ix*NY + iy;

            fnew[0][curr] = fold[0][curr];         /* central cell, no movement */
#if 1
            fnew[1][curr] = fold[1][iy + xp1*NY];  /* east */
            fnew[2][curr] = fold[2][yp1 + ix*NY];  /* north */
            fnew[3][curr] = fold[3][iy + xm1*NY];  /* west */
            fnew[4][curr] = fold[4][ym1 + ix*NY];  /* south */

            fnew[5][curr] = fold[5][yp1 + xp1*NY]; /* north-east */
            fnew[6][curr] = fold[6][yp1 + xm1*NY]; /* north-west */
            fnew[7][curr] = fold[7][ym1 + xm1*NY]; /* south-west */
            fnew[8][curr] = fold[8][ym1 + xp1*NY]; /* south-east */
#else
            fnew[1][iy + xm1*NY] = fold[1][curr];  /* east */
            fnew[2][ym1 + ix*NY] = fold[2][curr];  /* north */
            fnew[3][iy + xp1*NY] = fold[3][curr];  /* west */
            fnew[4][yp1 + ix*NY] = fold[4][curr];  /* south */

            fnew[5][ym1 + ym1*NY] = fold[5][curr]; /* north-east */
            fnew[6][ym1 + xp1*NY] = fold[6][curr]; /* north-west */
            fnew[7][yp1 + xp1*NY] = fold[7][curr]; /* south-west */
            fnew[8][yp1 + xm1*NY] = fold[8][curr]; /* south-east */
#endif
        });

    });

}


std::string filename(int n, std::string basename) {
    char buffer[13];
    std::snprintf(buffer, sizeof(buffer), "%08d.vtk", n);
    return basename.append(buffer);
}

int main(int argc, char const *argv[])
{

    sycl::device my_dev{sycl::default_selector{}};
    std::cout << "Running tests on " << my_dev.get_info<sycl::info::device::name>() << ".\n";

    std::vector<double> fold(NX*NY*9);
    std::vector<double> fnew(NX*NY*9);

    std::vector<double> rho(NX*NY);
    std::vector<double> vel(NX*NY*2);

    // Initial density
    std::fill(rho.begin(),rho.end(),1.0);

    // Initial velocity
    {
        double *vx = &vel[0];
        double *vy = &vel[NX*NY];
        for (int ix = 0; ix < NX; ++ix) {
            for (int iy = 0; iy < NY; ++iy) {
                const double x = (double) ix + 0.5;
                const double y = (double) iy + 0.5;
                auto vxy = layer({x,y});
                vx[ix*NY + iy] = vxy[0];
                vy[ix*NY + iy] = vxy[1];
            }
        }

        //writeVTK_rectilinear("begin.vtk",NX,NY,rho.data(),vx,vy);
    }


    // Catch asynchronous exceptions
    auto exception_handler = [](sycl::exception_list exceptions) {
        for (std::exception_ptr const &e : exceptions) {
            try {
                std::rethrow_exception(e);
            }
            catch (sycl::exception const &e) {
                std::cout << "Caught asynchronous SYCL "
                             "exception during execution:\n"
                          << e.what() << std::endl;
            }
        }
    };

    auto start = std::chrono::steady_clock::now();

    { /* kernel region */

        sycl::queue Q(my_dev, exception_handler);

        sycl::buffer<double,2> b_fold{fold.data(),{9,NX*NY}};
        sycl::buffer<double,2> b_fnew{fnew.data(),{9,NX*NY}};

        sycl::buffer<double,1> b_rho{rho.data(),{NX*NY}};
        sycl::buffer<double,2> b_vel{vel.data(),{2,NX*NY}};

        set_equilibrium(Q,b_rho,b_vel,b_fold);

        for (int step = 0; step < num_steps; ++step) {

            if (step % 2 == 0) {
                //macros(Q,b_fold,b_rho,b_vel);
                collide(Q,b_fold,b_rho,b_vel,omega);
                stream(Q,b_fold,b_fnew);
            } else {
                //macros(Q,b_fnew,b_rho,b_vel);
                collide(Q,b_fnew,b_rho,b_vel,omega);
                stream(Q,b_fnew,b_fold);
            }

            if (step % num_steps_io == 0 && num_steps_io > 0) {

                if (step % 2 == 0) {
                    macros(Q,b_fold,b_rho,b_vel);
                } else {
                    macros(Q,b_fnew,b_rho,b_vel);
                }

                sycl::host_accessor h_rho(b_rho);
                sycl::host_accessor h_vel(b_vel);

                double *hvx = h_vel.get_pointer();
                double *hvy = hvx + NX*NY;

                auto file = filename(step,"lbm");

                writeVTK_rectilinear(file.c_str(),NX,NY,h_rho.get_pointer(),hvx,hvy);
            }

        }

    } /* end kernel region */

    auto end = std::chrono::steady_clock::now();
    std::chrono::duration<double> elapsed_seconds = end - start;

    std::cout << "MLUPS = " << double(NX)*double(NY)*double(num_steps)*1.e-6/elapsed_seconds.count() << '\n';

#if 0
    {
        double *r = rho.data();
        double *vx = &vel[0];
        double *vy = &vel[NX*NY];
        writeVTK_rectilinear("end.vtk",NX,NY,r,vx,vy);
    }
#endif

    return 0;
}

## Makefile
CXX=icpx
CXXFLAGS=-Wall -g -O2 -march=native

.PHONY: all
all: example

example: example.cpp
	$(CXX) -o $@ $(CXXFLAGS) -fsycl -fsycl-targets=spir64_x86_64 $<

.PHONY: clean
clean:
	rm -f example
	// example.cpp
	//
	// Compile with:
	// icpx -fsycl -O3 -march=native example.cpp
	//
	#include <vector>
	#include <algorithm>
	#include <cassert>

	#include <cstdio>
	#include <string>
	#include <iostream>
	#include <chrono>

	#include <CL/sycl.hpp>

	const int N = 512;

	const int NX = N;
	const int NY = N;

	const int num_steps = 50000;
	const int num_steps_io = 10000;

	const double U0 = 0.1 / std::sqrt(3);
	const double nu = U0 * (double) N / 3000;

	const double tau = 3*nu + 0.5;
	const double omega = 1./tau;

	const double w0 = 4./9.;
	const double ws = 1./9.;
	const double wd = 1./36.;

	const std::array<double,9> w{w0,ws,ws,ws,ws,wd,wd,wd,wd};

	const double PI = 4*std::atan(1.0);

	struct shearlayer {
	double length;
	double kappa;
	double delta;
	double umax;

	std::array<double,2> operator()(std::array<double,2> xy) {

	const double x = xy[0]/length;
	const double y = xy[1]/length;

	const double ux = y < 0.5 ?
	umaxstd::tanh(kappa(y - 0.25)) : umaxstd::tanh(kappa(0.75 - y));
	const double uy = umaxdeltastd::sin(2.0PI(x + 0.25));

	return {ux,uy};
	}
	};

	struct shearlayer layer{N,80.,0.05,U0};

	template<typename T>
	void writeVTK_rectilinear(
	const char* filepath,
	int nx, int ny,
	const T rho[],
	const T ux[], const T uy[]) {

	std::ofstream vtkfile(filepath);

	vtkfile << "# vtk DataFile Version 3.0\n";
	vtkfile << "lbm grid output\n";
	vtkfile << "ASCII\n";

	#if 0
	vtkfile << "DATASET RECTILINEAR_GRID\n";

	vtkfile << "DIMENSIONS " << nx+1 << ' ' << ny+1 << " 2\n";

	vtkfile << "X_COORDINATES " << nx+1 << " float\n";
	for (int ix = 0; ix <= nx; ++ix) {
	vtkfile << (T) ix << '\n';
	}

	vtkfile << "Y_COORDINATES " << ny+1 << " float\n";
	for (int iy = 0; iy <= ny; ++iy) {
	vtkfile << (T) iy << '\n';
	}

	vtkfile << "Z_COORDINATES 2 float\n";
	vtkfile << (T) 0.0 << '\n';
	vtkfile << (T) 1.0 << '\n';
	#else
	vtkfile << "DATASET STRUCTURED_POINTS\n";
	vtkfile << "DIMENSIONS " << nx << ' ' << ny << " 1\n";
	vtkfile << "ORIGIN 0.5 0.5 0.5\n";
	vtkfile << "SPACING 1.0 1.0 0.0\n";
	#endif
	vtkfile << '\n';
	vtkfile << "POINT_DATA " << nx*ny << '\n';
	vtkfile << "SCALARS Density float 1\n";
	vtkfile << "LOOKUP_TABLE default\n";
	for (int iy = 0; iy < ny; ++iy) {
	for (int ix = 0; ix < nx; ++ix) {
	vtkfile << rho[ix*ny + iy] << '\n';
	}
	}
	vtkfile << "VECTORS Velocity float\n";
	for (int iy = 0; iy < ny; ++iy) {
	for (int ix = 0; ix < nx; ++ix) {
	vtkfile << ux[ixny+iy] << ' ' << uy[ixnx+iy] << " 0\n";
	}
	}
	vtkfile.close();
	}

	void set_equilibrium(sycl::queue &queue,
	sycl::buffer<double,1> brho,
	sycl::buffer<double,2> bvel,
	sycl::buffer<double,2> bf) {

	queue.submit([&](sycl::handler& h){

	sycl::accessor f(bf,h,sycl::write_only);
	sycl::accessor rho(brho,h,sycl::read_only);
	sycl::accessor vel(bvel,h,sycl::read_only);

	h.parallel_for(sycl::range{NX,NY},[=](sycl::id<2> id){

	const std::size_t i = id[0]*NY + id[1];

	const double ux = vel[0][i];
	const double uy = vel[1][i];
	const double usqr = uxux + uyuy;

	double cu[9];
	cu[0] = 0;
	cu[1] = ux;
	cu[2] = uy;
	cu[3] = -ux;
	cu[4] = -uy;
	cu[5] = ux + uy;
	cu[6] = -ux + uy;
	cu[7] = -ux - uy;
	cu[8] = ux - uy;

	for (int q = 0; q < 9; ++q) {
	f[q][i] = rho[i]w[q](1 + 3cu[q] + 4.5cu[q]cu[q] - 1.5usqr);
	}
	});
	});

	queue.wait();
	}

	void macros(sycl::queue &Q,
	sycl::buffer<double,2> b_f,
	sycl::buffer<double,1> b_rho,
	sycl::buffer<double,2> b_vel) {

	Q.submit([&](sycl::handler& h) {

	sycl::accessor f(b_f,h,sycl::read_only);
	sycl::accessor rho(b_rho,h,sycl::write_only);
	sycl::accessor vel(b_vel,h,sycl::write_only);

	h.parallel_for<class macro>(NX*NY,[=](sycl::id<1> i) {

	const double rho_ =
	((f[5][i] + f[7][i]) + (f[6][i] + f[8][i])) +
	((f[1][i] + f[3][i]) + (f[2][i] + f[4][i])) + f[0][i];

	rho[i] = rho_;

	const double invrho = 1.0/rho_;

	vel[0][i] = invrho * (((f[5][i] - f[7][i]) + (f[8][i] - f[6][i])) + (f[1][i] - f[3][i]));
	vel[1][i] = invrho * (((f[5][i] - f[7][i]) + (f[6][i] - f[8][i])) + (f[2][i] - f[4][i]));

	});

	});

	}

	void collide(sycl::queue &Q,

	sycl::buffer<double,2> b_f,
	sycl::buffer<double,1> b_rho,
	sycl::buffer<double,2> b_vel,
	double omega) {

	Q.submit([&](sycl::handler &h) {

	//sycl::accessor rho(b_rho,h,sycl::read_only);
	//sycl::accessor vel(b_vel,h,sycl::read_only);

	sycl::accessor f(b_f,h,sycl::read_write);

	const double w[3] = {w0, ws, wd};
	const double omegabar = 1.0 - omega;
	const double one_third = 1.0 / 3.0;
	const double one_half = 1.0 / 2.0;

	const double omega_w0 = 3.0 * omega * w[0];
	const double omega_ws = 3.0 * omega * w[1];
	const double omega_wd = 3.0 * omega * w[2];

	h.parallel_for<class collision>(NX*NY,[=](sycl::id<1> i) {

	// --- constants ---


	// --- common terms and direction 0 ---

	const double rho =
	((f[5][i] + f[7][i]) + (f[6][i] + f[8][i])) +
	((f[1][i] + f[3][i]) + (f[2][i] + f[4][i])) + f[0][i];

	const double invrho = 1.0/rho;

	const double velx = invrho * (((f[5][i] - f[7][i]) + (f[8][i] - f[6][i])) + (f[1][i] - f[3][i]));
	const double vely = invrho * (((f[5][i] - f[7][i]) + (f[6][i] - f[8][i])) + (f[2][i] - f[4][i]));

	const double velxx = velx*velx;
	const double velyy = vely*vely;

	const double indp = one_third - one_half * (velxx + velyy);

	f[0][i] = omegabar * f[0][i] + omega_w0 * rho * indp;

	// ------

	const double vel_trm_13 = indp + (3./2.)*velxx;

	f[1][i] = omegabar * f[1][i] + omega_wsrho(vel_trm_13 + velx);
	f[3][i] = omegabar * f[3][i] + omega_wsrho(vel_trm_13 - velx);

	// ------

	const double vel_trm_24 = indp + (3./2.)*velyy;

	f[2][i] = omegabar * f[2][i] + omega_ws * rho * (vel_trm_24 + vely);
	f[4][i] = omegabar * f[4][i] + omega_ws * rho * (vel_trm_24 - vely);

	// ------

	const double velxpy = velx + vely;
	const double vel_trm_57 = indp + (3./2.) * velxpy * velxpy;

	f[5][i] = omegabar * f[5][i] + omega_wd * rho * (vel_trm_57 + velxpy);
	f[7][i] = omegabar * f[7][i] + omega_wd * rho * (vel_trm_57 - velxpy);

	// ------

	const double velxmy = velx - vely;
	const double vel_trm_68 = indp + (3./2.) * velxmy * velxmy;

	f[6][i] = omegabar * f[6][i] + omega_wd * rho * (vel_trm_68 - velxmy);
	f[8][i] = omegabar * f[8][i] + omega_wd * rho * (vel_trm_68 + velxmy);

	});

	});

	}

	void stream(sycl::queue &Q,
	sycl::buffer<double,2> bfold,
	sycl::buffer<double,2> bfnew) {

	Q.submit([&](sycl::handler& h) {

	sycl::accessor fold(bfold,h,sycl::read_only);
	sycl::accessor fnew(bfnew,h,sycl::write_only);

	h.parallel_for(sycl::range{NX,NY},[=](sycl::id<2> id) {

	auto ix = id[0];
	auto iy = id[1];

	size_t xp1 = (ix + 1) % NX;
	size_t xm1 = (ix + NX - 1) % NX;

	size_t yp1 = (iy + 1) % NY;
	size_t ym1 = (iy + NY - 1) % NY;

	size_t curr = ix*NY + iy;

	fnew[0][curr] = fold[0][curr]; /* central cell, no movement */
	#if 1
	fnew[1][curr] = fold[1][iy + xp1NY]; / east */
	fnew[2][curr] = fold[2][yp1 + ixNY]; / north */
	fnew[3][curr] = fold[3][iy + xm1NY]; / west */
	fnew[4][curr] = fold[4][ym1 + ixNY]; / south */

	fnew[5][curr] = fold[5][yp1 + xp1NY]; / north-east */
	fnew[6][curr] = fold[6][yp1 + xm1NY]; / north-west */
	fnew[7][curr] = fold[7][ym1 + xm1NY]; / south-west */
	fnew[8][curr] = fold[8][ym1 + xp1NY]; / south-east */
	#else
	fnew[1][iy + xm1NY] = fold[1][curr]; / east */
	fnew[2][ym1 + ixNY] = fold[2][curr]; / north */
	fnew[3][iy + xp1NY] = fold[3][curr]; / west */
	fnew[4][yp1 + ixNY] = fold[4][curr]; / south */

	fnew[5][ym1 + ym1NY] = fold[5][curr]; / north-east */
	fnew[6][ym1 + xp1NY] = fold[6][curr]; / north-west */
	fnew[7][yp1 + xp1NY] = fold[7][curr]; / south-west */
	fnew[8][yp1 + xm1NY] = fold[8][curr]; / south-east */
	#endif
	});

	});

	}


	std::string filename(int n, std::string basename) {
	char buffer[13];
	std::snprintf(buffer, sizeof(buffer), "%08d.vtk", n);
	return basename.append(buffer);
	}

	int main(int argc, char const *argv[])
	{

	sycl::device my_dev{sycl::default_selector{}};
	std::cout << "Running tests on " << my_dev.get_info<sycl::info::device::name>() << ".\n";

	std::vector<double> fold(NXNY9);
	std::vector<double> fnew(NXNY9);

	std::vector<double> rho(NX*NY);
	std::vector<double> vel(NXNY2);

	// Initial density
	std::fill(rho.begin(),rho.end(),1.0);

	// Initial velocity
	{
	double *vx = &vel[0];
	double vy = &vel[NXNY];
	for (int ix = 0; ix < NX; ++ix) {
	for (int iy = 0; iy < NY; ++iy) {
	const double x = (double) ix + 0.5;
	const double y = (double) iy + 0.5;
	auto vxy = layer({x,y});
	vx[ix*NY + iy] = vxy[0];
	vy[ix*NY + iy] = vxy[1];
	}
	}

	//writeVTK_rectilinear("begin.vtk",NX,NY,rho.data(),vx,vy);
	}


	// Catch asynchronous exceptions
	auto exception_handler = [](sycl::exception_list exceptions) {
	for (std::exception_ptr const &e : exceptions) {
	try {
	std::rethrow_exception(e);
	}
	catch (sycl::exception const &e) {
	std::cout << "Caught asynchronous SYCL "
	"exception during execution:\n"
	<< e.what() << std::endl;
	}
	}
	};

	auto start = std::chrono::steady_clock::now();

	{ /* kernel region */

	sycl::queue Q(my_dev, exception_handler);

	sycl::buffer<double,2> b_fold{fold.data(),{9,NX*NY}};
	sycl::buffer<double,2> b_fnew{fnew.data(),{9,NX*NY}};

	sycl::buffer<double,1> b_rho{rho.data(),{NX*NY}};
	sycl::buffer<double,2> b_vel{vel.data(),{2,NX*NY}};

	set_equilibrium(Q,b_rho,b_vel,b_fold);

	for (int step = 0; step < num_steps; ++step) {

	if (step % 2 == 0) {
	//macros(Q,b_fold,b_rho,b_vel);
	collide(Q,b_fold,b_rho,b_vel,omega);
	stream(Q,b_fold,b_fnew);
	} else {
	//macros(Q,b_fnew,b_rho,b_vel);
	collide(Q,b_fnew,b_rho,b_vel,omega);
	stream(Q,b_fnew,b_fold);
	}

	if (step % num_steps_io == 0 && num_steps_io > 0) {

	if (step % 2 == 0) {
	macros(Q,b_fold,b_rho,b_vel);
	} else {
	macros(Q,b_fnew,b_rho,b_vel);
	}

	sycl::host_accessor h_rho(b_rho);
	sycl::host_accessor h_vel(b_vel);

	double *hvx = h_vel.get_pointer();
	double hvy = hvx + NXNY;

	auto file = filename(step,"lbm");

	writeVTK_rectilinear(file.c_str(),NX,NY,h_rho.get_pointer(),hvx,hvy);
	}

	}

	} /* end kernel region */

	auto end = std::chrono::steady_clock::now();
	std::chrono::duration<double> elapsed_seconds = end - start;

	std::cout << "MLUPS = " << double(NX)double(NY)double(num_steps)*1.e-6/elapsed_seconds.count() << '\n';

	#if 0
	{
	double *r = rho.data();
	double *vx = &vel[0];
	double vy = &vel[NXNY];
	writeVTK_rectilinear("end.vtk",NX,NY,r,vx,vy);
	}
	#endif

	return 0;
	}
	CXX=icpx
	CXXFLAGS=-Wall -g -O2 -march=native

	.PHONY: all
	all: example

	example: example.cpp
	$(CXX) -o $@ $(CXXFLAGS) -fsycl -fsycl-targets=spir64_x86_64 $<

	.PHONY: clean
	clean:
	rm -f example