Skip to content

Instantly share code, notes, and snippets.

@Mark1626
Last active April 1, 2023 11:41
Show Gist options
  • Save Mark1626/c1f4aebef5b085f74e05b7d3457a3d5b to your computer and use it in GitHub Desktop.
Save Mark1626/c1f4aebef5b085f74e05b7d3457a3d5b to your computer and use it in GitHub Desktop.
Benchmark to see if Hilbert curve property of cache locality can improve performance in a 2D Median Filter.
/*
Author: Nimalan M @mark1626
Comment: Minor improvement using a Hilbert curve when using -O3 + march=native, machine used AMD Ryzen 5800X
Note: Parallelism for the Hilbert version has not been completed
Credits: Function to calculate next position in Hilbert curve from Hacker's Delight (Second Edition) by Henry S. Warren, Jr.
*/
#include <cstdio>
#include <cstdlib>
#include <chrono>
#include <vector>
#include <algorithm>
#include <cmath>
using namespace std::chrono;
const int NDIM = 2;
const double TOLERANCE = 0.001;
inline void swap(float &a, float &b) {
float temp = a;
a = b;
b = temp;
}
float qselect(float *arr, int len, int nth) {
int start = 0;
for (int index = 0; index < len - 1; index++) {
if (arr[index] > arr[len - 1])
continue;
swap(arr[index], arr[start]);
start++;
}
swap(arr[len - 1], arr[start]);
if (nth == start)
return arr[start];
return start > nth ? qselect(arr, start, nth)
: qselect(arr + start, len - start, nth - start);
}
void sliding_window(const float *arr, const bool *mask, float *res1,
const int shape[NDIM], const int hboxsz[NDIM],
const int resShape[NDIM]) {
const int bxlen = 2*hboxsz[0] + 1;
const int bylen = 2*hboxsz[1] + 1;
const int xlen = shape[0];
const int size = bxlen * bylen;
// float *tmp = new float[size];
#pragma omp parallel shared(res)
{
std::vector<float> tmp(size);
#pragma omp for
for (int y = 0; y < resShape[1]; y++) {
for (int x = 0; x < resShape[0]; x++) {
int pos[NDIM];
int blc[NDIM];
int trc[NDIM];
pos[0] = x;
pos[1] = y;
blc[0] = pos[0];
blc[1] = pos[1];
trc[0] = blc[0] + bxlen;
trc[1] = blc[1] + bxlen;
int len = 0;
for (int yy = blc[1]; yy < trc[1]; yy++) {
for (int xx = blc[0]; xx < trc[0]; xx++) {
int idx = yy * xlen + xx;
if (mask[idx]) {
tmp[len] = arr[idx];
len++;
}
}
}
auto median = 0.0f;
if (tmp.size() > 0) {
auto end = tmp.begin() + len;
auto mid = tmp.begin() + (len / 2);
std::nth_element(tmp.begin(), mid, end);
median = tmp[len / 2];
}
// auto median = 0.0f;
// if (len > 0) {
// auto mid = len / 2;
// median = qselect(tmp, len, mid);
// }
res1[y * resShape[0] + x] = median;
//res2[y * resShape[0] + x] = mid;
}
}
// delete [] tmp;
}
}
void hil_inc_xy(unsigned *xp, unsigned *yp, int n) {
int i;
unsigned x, y, state, dx, dy, row, docchange;
x = *xp;
y = *yp;
state = 0;
dx = -((1<<n) - 1);
dy = 0;
for (i = n-1; i >= 0; i--) {
row = 4*state | 2*((x>>i) & 1) | ((y >> i) & 1);
docchange = (0xBDDB >> row) & 1;
if (docchange) {
dx = ((0x16451659 >> 2*row) & 3) - 1;
dy = ((0x51166516 >> 2*row) & 3) - 1;
}
state = (0x8FE65831 >> 2*row) & 3;
}
*xp = *xp + dx;
*yp = *yp + dy;
}
void hilbert_sliding_window(const float *arr, const bool *mask, float *res1,
const int shape[NDIM], const int hboxsz[NDIM],
const int resShape[NDIM]) {
const int bxlen = 2*hboxsz[0] + 1;
const int bylen = 2*hboxsz[1] + 1;
const int xlen = shape[0];
unsigned int dim = ceil(log2(resShape[1]));
unsigned int arr_dim = 1 << dim;
printf("Dim: %d %d\n", arr_dim, arr_dim);
unsigned int size = arr_dim * arr_dim;
printf("Size: %d\n", size);
printf("Res Size: %d %d\n", resShape[0], resShape[1]);
unsigned int x = 0;
unsigned int y = 0;
// float *tmp = new float[size];
// #pragma omp parallel shared(res)
{
std::vector<float> tmp(size);
// #pragma omp for
for (unsigned int i = 0; i < size; i++) {
if (x < resShape[0] && y < resShape[1]) {
int pos[NDIM];
int blc[NDIM];
int trc[NDIM];
pos[0] = x;
pos[1] = y;
blc[0] = pos[0];
blc[1] = pos[1];
trc[0] = blc[0] + bxlen;
trc[1] = blc[1] + bxlen;
// std::vector<float> tmp;
// float *tmp = new float[bxlen * bylen];
int len = 0;
for (int yy = blc[1]; yy < trc[1]; yy++) {
for (int xx = blc[0]; xx < trc[0]; xx++) {
int idx = yy * xlen + xx;
if (mask[idx]) {
tmp[len] = arr[idx];
len++;
}
}
}
auto median = 0.0f;
if (tmp.size() > 0) {
auto end = tmp.begin() + len;
auto mid = tmp.begin() + (len / 2);
std::nth_element(tmp.begin(), mid, end);
median = tmp[len / 2];
}
// auto median = 0.0f;
// if (len > 0) {
// auto mid = len / 2;
// median = qselect(tmp, len, mid);
// }
res1[y * resShape[0] + x] = median;
//res2[y * resShape[0] + x] = mid;
// delete[] tmp;
// printf("(%d, %d) Size %d m: %f \n", x, y, tmp.size(), res1[y * resShape[0] + x]);
}
hil_inc_xy(&x, &y, dim);
}
// delete [] tmp;
}
}
void experiment(int SIZE, int WINDOW_SIZE) {
int shape[NDIM] = {SIZE, SIZE};
int hboxsz[NDIM] = {WINDOW_SIZE, WINDOW_SIZE};
int boxsz[NDIM] = {2*hboxsz[0]+1, 2*hboxsz[1]+1};
int resShape[NDIM] = {shape[0] - boxsz[0], shape[1] - boxsz[1]};
size_t arrSize = sizeof(float) * shape[0] * shape[1];
size_t maskSize = sizeof(bool) * shape[0] * shape[1];
size_t resSize = sizeof(float) * resShape[0] * resShape[1];
size_t ndim = 2;
float *arr = new float[arrSize];
bool *mask = new bool[maskSize];
double a = 5.0;
size_t total_masked = 0;
for (int y = 0; y < shape[1]; y++) {
for (int x = 0; x < shape[0]; x++) {
float val = (double)std::rand() / (double)(RAND_MAX / a);
arr[y * shape[0] + x] = val;
mask[y * shape[0] + x] = val > 1.0;
total_masked += mask[y * shape[0] + x] ? 1 : 0;
}
}
printf("Percent masked: %f\n", ((double)total_masked)/((double)arrSize));
float *res_golden = new float[resShape[0] * resShape[1]];
float *res_hilbert = new float[resShape[0] * resShape[1]];
printf("Image size %d %d\n", SIZE, SIZE);
printf("Window size %d %d\n", WINDOW_SIZE, WINDOW_SIZE);
printf("Result size %d %d\n", resShape[0], resShape[1]);
auto t1 = high_resolution_clock::now();
auto t2 = high_resolution_clock::now();
#ifdef GOLDEN
t1 = high_resolution_clock::now();
sliding_window(arr, mask, res_golden, shape, hboxsz, resShape);
t2 = high_resolution_clock::now();
printf("Time taken CPU Median filter: %lld ms\n",
duration_cast<milliseconds>(t2 - t1).count());
#endif
#ifdef HILBERT
t1 = high_resolution_clock::now();
hilbert_sliding_window(arr, mask, res_hilbert, shape, hboxsz, resShape);
t2 = high_resolution_clock::now();
printf("Time taken Hilbert Median filter: %lld ms\n",
duration_cast<milliseconds>(t2 - t1).count());
#endif
#ifdef ASSERT
for (int y = 0; y < resShape[1]; y++) {
for (int x = 0; x < resShape[0]; x++) {
float actual = res_hilbert[y * resShape[0] + x];
float expected = res_golden[y * resShape[0] + x];
float deviation = (expected - actual) / expected;
if (deviation > 0.05) {
fprintf(stderr,
"Assertion failed value at %d %d expected: %.2f actual: %.2f deviation: %.2f\n",
x, y, expected, actual, deviation);
}
// actual = res_gpu_2[y * resShape[0] + x];
// expected = res_cpu_2[y * resShape[0] + x];
// if (fabs(actual - expected) > TOLERANCE) {
// fprintf(stderr,
// "Assertion failed value 2 at %d %d expected: %f actual: %f\n",
// x, y, expected, actual);
// }
}
}
printf("Assertions complete\n");
#endif
delete []res_golden;
delete []res_hilbert;
delete []arr;
delete []mask;
}
int main(int argc, char **argv) {
if (argc < 3) {
fprintf(stderr, "Usage:\n exp-2 <SIZE> <WINDOW_SIZE>\n");
return 1;
}
int SIZE = std::atoi(argv[1]);
int WINDOW_SIZE = std::atoi(argv[2]);
// unsigned x = 0;
// unsigned y = 0;
// for (int i = 0; i < 64; i++) {
// printf("(%d, %d)\n", x, y);
// hil_inc_xy(&x, &y, 8);
// }
experiment(SIZE, WINDOW_SIZE);
}

Summary

Minor improvement using a Hilbert curve when using -O3 + march=native, machine used AMD Ryzen 5800X

-O3

Built with

c++ -o hilbert-gcc hilbert.cc -O3 -std=c++17 -g -fno-omit-frame-pointer -DGOLDEN -DHILBERT
clang++ -o hilbert-clang hilbert.cc -O3 -std=c++17 -g -fno-omit-frame-pointer -DGOLDEN -DHILBERT
(base) [nimalan@localhost ctest]$ ./hilbert-gcc 1124 50
Percent masked: 0.200070
Image size 1124 1124
Window size 50 50
Result size 1023 1023
Time taken CPU Median filter: 73317 ms
Dim: 1024 1024
Size: 1048576
Res Size: 1023 1023
Time taken Hilbert Median filter: 73159 ms
(base) [nimalan@localhost ctest]$ ./hilbert-clang 1124 50
Percent masked: 0.200070
Image size 1124 1124
Window size 50 50
Result size 1023 1023
Time taken CPU Median filter: 74025 ms
Dim: 1024 1024
Size: 1048576
Res Size: 1023 1023
Time taken Hilbert Median filter: 71439 ms

perf diff

# Event 'cycles:u'
#
# Baseline  Delta Abs  Shared Object         Symbol                                                                                                                                            
# ........  .........  ....................  ..................................................................................................................................................
#
              +84.96%  hilbert-only-clang    [.] std::__introselect<__gnu_cxx::__normal_iterator<float*, std::vector<float, std::allocator<float> > >, long, __gnu_cxx::__ops::_Iter_less_iter>
              +14.98%  hilbert-only-clang    [.] hilbert_sliding_window
     0.10%     -0.04%  [unknown]             [k] 0xffffffffa3a06beb
               +0.00%  hilbert-only-clang    [.] experiment
     0.00%     +0.00%  ld-linux-x86-64.so.2  [.] handle_amd
    84.17%             hilbert-only          [.] std::__introselect<__gnu_cxx::__normal_iterator<float*, std::vector<float, std::allocator<float> > >, long, __gnu_cxx::__ops::_Iter_less_iter>
    15.73%             hilbert-only          [.] hilbert_sliding_window
     0.00%             hilbert-only          [.] experiment
     0.00%             ld-linux-x86-64.so.2  [.] _dl_start
(base) [nimalan@localhost ctest]$ perf report -g -i perf.data
(base) [nimalan@localhost ctest]$ clang++ -o hilbert-only-clang hilbert.cc -O3 -std=c++17 -g -fno-omit-frame-pointer -DHILBERT -march=native

--------------------------------------------------------------------

-O3 + march=native

Built with

c++ -o hilbert-gcc hilbert.cc -O3 -std=c++17 -g -fno-omit-frame-pointer -DGOLDEN -DHILBERT -march=native
clang++ -o hilbert-clang hilbert.cc -O3 -std=c++17 -g -fno-omit-frame-pointer -DGOLDEN -DHILBERT -march=native

Overall Comparison

(base) [nimalan@localhost ctest]$ ./hilbert-gcc 1124 50
Percent masked: 0.200070
Image size 1124 1124
Window size 50 50
Result size 1023 1023
Time taken CPU Median filter: 73458 ms
Dim: 1024 1024
Size: 1048576
Res Size: 1023 1023
Time taken Hilbert Median filter: 72654 ms
(base) [nimalan@localhost ctest]$ ./hilbert-clang 1124 50
Percent masked: 0.200070
Image size 1124 1124
Window size 50 50
Result size 1023 1023
Time taken CPU Median filter: 75060 ms
Dim: 1024 1024
Size: 1048576
Res Size: 1023 1023
Time taken Hilbert Median filter: 72768 ms

perf diff

(base) [nimalan@localhost ctest]$ perf record -g -F99 ./hilbert-only 1124 50
Percent masked: 0.200070
Image size 1124 1124
Window size 50 50
Result size 1023 1023
Dim: 1024 1024
Size: 1048576
Res Size: 1023 1023
Time taken Hilbert Median filter: 72190 ms
[ perf record: Woken up 2 times to write data ]
[ perf record: Captured and wrote 0.485 MB perf.data (7154 samples) ]
(base) [nimalan@localhost ctest]$ perf record -g -F99 ./hilbert-only-clang 1124 50
Percent masked: 0.200070
Image size 1124 1124
Window size 50 50
Result size 1023 1023
Dim: 1024 1024
Size: 1048576
Res Size: 1023 1023
Time taken Hilbert Median filter: 71963 ms
[ perf record: Woken up 2 times to write data ]
[ perf record: Captured and wrote 0.437 MB perf.data (7128 samples) ]

# Event 'cycles:u'
#
# Baseline  Delta Abs  Shared Object         Symbol                                                                                                                                            
# ........  .........  ....................  ..................................................................................................................................................
#
              +83.71%  hilbert-only-clang    [.] std::__introselect<__gnu_cxx::__normal_iterator<float*, std::vector<float, std::allocator<float> > >, long, __gnu_cxx::__ops::_Iter_less_iter>
              +16.23%  hilbert-only-clang    [.] hilbert_sliding_window
     0.07%     -0.01%  [unknown]             [k] 0xffffffffa3a06beb
               +0.00%  hilbert-only-clang    [.] experiment
     0.00%     +0.00%  ld-linux-x86-64.so.2  [.] handle_amd
    84.28%             hilbert-only          [.] std::__introselect<__gnu_cxx::__normal_iterator<float*, std::vector<float, std::allocator<float> > >, long, __gnu_cxx::__ops::_Iter_less_iter>
    15.64%             hilbert-only          [.] hilbert_sliding_window
     0.00%             hilbert-only          [.] experiment
     0.00%             ld-linux-x86-64.so.2  [.] _dl_start

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment