Skip to content

Instantly share code, notes, and snippets.

@itzmeanjan
Last active Sep 19, 2021
Embed
What would you like to do?
😎 Computing Julia Set on CPU + GPGPU, using SYCL DPC++ πŸš€
#include <CL/sycl.hpp>
#include <chrono>
#include <iostream>
using namespace sycl;
namespace ts = std::chrono;
constexpr uint N = 512;
constexpr uint B = 4;
constexpr float R = 2.f; // radius
constexpr uint MAX_ITR = 1 << 10;
float length(sycl::cl_float2 v) { return sqrtf(v.x() * v.x() + v.y() * v.y()); }
sycl::cl_float2 get_complex_value_at_index(uint row, uint col) {
float half = (float)N / 2.f;
float step = 1.f;
float factor = sqrtf(powf(half, 2) + powf(half, 2));
return {((-half + (float)row * step) / factor) * R,
((half - (float)col * step) / factor) * R};
}
uint quadratic_iteration(sycl::cl_float2 z, sycl::cl_float2 c) {
uint itr = 0;
for (; itr < MAX_ITR; itr++) {
z = {z.x() * z.x() - z.y() * z.y() + c.x(), 2 * z.y() * z.x() + c.y()};
if (length(z) > R) {
break;
}
}
return itr;
}
sycl::cl_float4 colorize(uint itr) {
sycl::cl_float3 d = {0.5, 0.5, 0.5};
sycl::cl_float3 e = {0.5, 0.5, 0.5};
sycl::cl_float3 f = {2.0, 1.0, 0.0};
sycl::cl_float3 g = {0.5, 0.2, 0.25};
sycl::cl_float4 pixel_val = {
d + e * cos(6.28318f * (f * ((float)itr / (float)(MAX_ITR)) + g)), 1.f};
return pixel_val;
}
int64_t compute_julia_set_dentrite(queue &q, void *data) {
image<2> img{data, image_channel_order::rgba, image_channel_type::unorm_int8,
range<2>{N, N}};
auto start = ts::steady_clock::now();
q.submit([&](handler &h) {
accessor<float4, 2, access::mode::write, access::target::image> acc{img, h};
h.parallel_for(nd_range<2>{range<2>{N, N}, range<2>{B, 1}},
[=](nd_item<2> it) {
const uint i = it.get_global_id()[0];
const uint j = it.get_global_id()[1];
const sycl::cl_float2 c = {0.f, 1.f};
sycl::cl_float2 z = get_complex_value_at_index(i, j);
uint itr = quadratic_iteration(z, c);
acc.write(int2(i, j), colorize(itr));
});
});
q.wait();
auto end = ts::steady_clock::now();
return ts::duration_cast<ts::milliseconds>(end - start).count();
}
int64_t compute_julia_set_douadys_rabbit(queue &q, void *data) {
image<2> img{data, image_channel_order::rgba, image_channel_type::unorm_int8,
range<2>{N, N}};
auto start = ts::steady_clock::now();
q.submit([&](handler &h) {
accessor<float4, 2, access::mode::write, access::target::image> acc{img, h};
h.parallel_for(nd_range<2>{range<2>{N, N}, range<2>{B, 1}},
[=](nd_item<2> it) {
const uint i = it.get_global_id()[0];
const uint j = it.get_global_id()[1];
const sycl::cl_float2 c = {-0.123, 0.745};
sycl::cl_float2 z = get_complex_value_at_index(i, j);
uint itr = quadratic_iteration(z, c);
acc.write(int2(i, j), colorize(itr));
});
});
q.wait();
auto end = ts::steady_clock::now();
return ts::duration_cast<ts::milliseconds>(end - start).count();
}
int64_t compute_julia_set_san_marco(queue &q, void *data) {
image<2> img{data, image_channel_order::rgba, image_channel_type::unorm_int8,
range<2>{N, N}};
auto start = ts::steady_clock::now();
q.submit([&](handler &h) {
accessor<float4, 2, access::mode::write, access::target::image> acc{img, h};
h.parallel_for(nd_range<2>{range<2>{N, N}, range<2>{B, 1}},
[=](nd_item<2> it) {
const uint i = it.get_global_id()[0];
const uint j = it.get_global_id()[1];
const sycl::cl_float2 c = {-0.750, 0.f};
sycl::cl_float2 z = get_complex_value_at_index(i, j);
uint itr = quadratic_iteration(z, c);
acc.write(int2(i, j), colorize(itr));
});
});
q.wait();
auto end = ts::steady_clock::now();
return ts::duration_cast<ts::milliseconds>(end - start).count();
}
int64_t compute_julia_set_siegel_disk(queue &q, void *data) {
image<2> img{data, image_channel_order::rgba, image_channel_type::unorm_int8,
range<2>{N, N}};
auto start = ts::steady_clock::now();
q.submit([&](handler &h) {
accessor<float4, 2, access::mode::write, access::target::image> acc{img, h};
h.parallel_for(nd_range<2>{range<2>{N, N}, range<2>{B, 1}},
[=](nd_item<2> it) {
const uint i = it.get_global_id()[0];
const uint j = it.get_global_id()[1];
const sycl::cl_float2 c = {-0.391, -0.587};
sycl::cl_float2 z = get_complex_value_at_index(i, j);
uint itr = quadratic_iteration(z, c);
acc.write(int2(i, j), colorize(itr));
});
});
q.wait();
auto end = ts::steady_clock::now();
return ts::duration_cast<ts::milliseconds>(end - start).count();
}
int64_t compute_julia_set_neat_cauliflower(queue &q, void *data) {
image<2> img{data, image_channel_order::rgba, image_channel_type::unorm_int8,
range<2>{N, N}};
auto start = ts::steady_clock::now();
q.submit([&](handler &h) {
accessor<float4, 2, access::mode::write, access::target::image> acc{img, h};
h.parallel_for(nd_range<2>{range<2>{N, N}, range<2>{B, 1}},
[=](nd_item<2> it) {
const uint i = it.get_global_id()[0];
const uint j = it.get_global_id()[1];
const sycl::cl_float2 c = {-0.7, -0.3};
sycl::cl_float2 z = get_complex_value_at_index(i, j);
uint itr = quadratic_iteration(z, c);
acc.write(int2(i, j), colorize(itr));
});
});
q.wait();
auto end = ts::steady_clock::now();
return ts::duration_cast<ts::milliseconds>(end - start).count();
}
int64_t compute_julia_set_galaxies(queue &q, void *data) {
image<2> img{data, image_channel_order::rgba, image_channel_type::unorm_int8,
range<2>{N, N}};
auto start = ts::steady_clock::now();
q.submit([&](handler &h) {
accessor<float4, 2, access::mode::write, access::target::image> acc{img, h};
h.parallel_for(nd_range<2>{range<2>{N, N}, range<2>{B, 1}},
[=](nd_item<2> it) {
const uint i = it.get_global_id()[0];
const uint j = it.get_global_id()[1];
const sycl::cl_float2 c = {-0.75, -0.2};
sycl::cl_float2 z = get_complex_value_at_index(i, j);
uint itr = quadratic_iteration(z, c);
acc.write(int2(i, j), colorize(itr));
});
});
q.wait();
auto end = ts::steady_clock::now();
return ts::duration_cast<ts::milliseconds>(end - start).count();
}
int64_t compute_julia_set_groovy(queue &q, void *data) {
image<2> img{data, image_channel_order::rgba, image_channel_type::unorm_int8,
range<2>{N, N}};
auto start = ts::steady_clock::now();
q.submit([&](handler &h) {
accessor<float4, 2, access::mode::write, access::target::image> acc{img, h};
h.parallel_for(nd_range<2>{range<2>{N, N}, range<2>{B, 1}},
[=](nd_item<2> it) {
const uint i = it.get_global_id()[0];
const uint j = it.get_global_id()[1];
const sycl::cl_float2 c = {-0.75, 0.15};
sycl::cl_float2 z = get_complex_value_at_index(i, j);
uint itr = quadratic_iteration(z, c);
acc.write(int2(i, j), colorize(itr));
});
});
q.wait();
auto end = ts::steady_clock::now();
return ts::duration_cast<ts::milliseconds>(end - start).count();
}
int64_t compute_julia_set_frost(queue &q, void *data) {
image<2> img{data, image_channel_order::rgba, image_channel_type::unorm_int8,
range<2>{N, N}};
auto start = ts::steady_clock::now();
q.submit([&](handler &h) {
accessor<float4, 2, access::mode::write, access::target::image> acc{img, h};
h.parallel_for(nd_range<2>{range<2>{N, N}, range<2>{B, 1}},
[=](nd_item<2> it) {
const uint i = it.get_global_id()[0];
const uint j = it.get_global_id()[1];
const sycl::cl_float2 c = {-0.7f, 0.35};
sycl::cl_float2 z = get_complex_value_at_index(i, j);
uint itr = quadratic_iteration(z, c);
acc.write(int2(i, j), colorize(itr));
});
});
q.wait();
auto end = ts::steady_clock::now();
return ts::duration_cast<ts::milliseconds>(end - start).count();
}
size_t export_to_file(void *data, size_t size, char const *path) {
FILE *fp = fopen(path, "wb");
size_t n = fwrite(data, size, 1, fp);
fclose(fp);
return size * n;
}
int main(int argc, char **argv) {
device d{host_selector{}};
queue q(d, ([](exception_list l) {
for (auto ep : l) {
try {
std::rethrow_exception(ep);
} catch (const cl::sycl::exception &e) {
std::cout << "Async exception caught:\n" << e.what() << "\n";
throw;
}
}
}));
size_t size = N * N * 4;
uint8_t *data = (uint8_t *)malloc(size * sizeof(uint8_t));
memset(data, 0, size); // redundant here though
auto tm = compute_julia_set_dentrite(q, data);
std::cout << "computed in " << tm << " ms" << std::endl;
uint total = export_to_file(data, size, "julia_set.dentrite.raw");
std::cout << "wrote " << total << " bytes" << std::endl;
memset(data, 0, size); // redundant here though
tm = compute_julia_set_douadys_rabbit(q, data);
std::cout << "computed in " << tm << " ms" << std::endl;
total = export_to_file(data, size, "julia_set.douadys_rabbit.raw");
std::cout << "wrote " << total << " bytes" << std::endl;
memset(data, 0, size); // redundant here though
tm = compute_julia_set_san_marco(q, data);
std::cout << "computed in " << tm << " ms" << std::endl;
total = export_to_file(data, size, "julia_set.san_marco.raw");
std::cout << "wrote " << total << " bytes" << std::endl;
memset(data, 0, size); // redundant here though
tm = compute_julia_set_siegel_disk(q, data);
std::cout << "computed in " << tm << " ms" << std::endl;
total = export_to_file(data, size, "julia_set.siegel_disk.raw");
std::cout << "wrote " << total << " bytes" << std::endl;
memset(data, 0, size); // redundant here though
tm = compute_julia_set_neat_cauliflower(q, data);
std::cout << "computed in " << tm << " ms" << std::endl;
total = export_to_file(data, size, "julia_set.neat_cauliflower.raw");
std::cout << "wrote " << total << " bytes" << std::endl;
memset(data, 0, size); // redundant here though
tm = compute_julia_set_galaxies(q, data);
std::cout << "computed in " << tm << " ms" << std::endl;
total = export_to_file(data, size, "julia_set.galaxies.raw");
std::cout << "wrote " << total << " bytes" << std::endl;
memset(data, 0, size); // redundant here though
tm = compute_julia_set_groovy(q, data);
std::cout << "computed in " << tm << " ms" << std::endl;
total = export_to_file(data, size, "julia_set.groovy.raw");
std::cout << "wrote " << total << " bytes" << std::endl;
memset(data, 0, size); // redundant here though
tm = compute_julia_set_frost(q, data);
std::cout << "computed in " << tm << " ms" << std::endl;
total = export_to_file(data, size, "julia_set.frost.raw");
std::cout << "wrote " << total << " bytes" << std::endl;
free(data);
return 0;
}
@itzmeanjan
Copy link
Author

itzmeanjan commented Sep 17, 2021

Introduction

I program Julia Set computation on accelerator devices ( i.e. CPU, GPGPU, FPGA etc. ) using SYCL DPC++. Given that during Julia Set computation each pixel can be processed independently, without any data dependencies, this problem is naturally a good fit for parallel computation.

More about Julia set: https://en.wikipedia.org/wiki/Julia_set

Usage

Install DPC++ compiler toolchain, following https://intel.github.io/llvm-docs/....

Compile with

clang++ -fsycl julia_set.cpp

Run with

./a.out

Visualise fractal image

magick -size 2048x2048 -depth 8 RGBA:julia_set.raw julia_set.png

Make sure image dimension matches with https://gist.github.com/itzmeanjan/...

@itzmeanjan
Copy link
Author

itzmeanjan commented Sep 17, 2021

Generated fractal image looks like πŸ‘‡

julia

@itzmeanjan
Copy link
Author

itzmeanjan commented Sep 19, 2021

c Fractal
-0.391 -0.587j julia_set siegel_disk raw
0.f + 1.0j julia_set dentrite raw
-0.123 + 0.745j julia_set douadys_rabbit raw
-0.7f + 0.35j julia_set frost raw
-0.75 -0.2j julia_set galaxies raw
-0.75 + 0.15j julia_set groovy raw
-0.7 -0.3j julia_set neat_cauliflower raw
-0.75 + 0.0j julia_set san_marco raw

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment