Skip to content

Instantly share code, notes, and snippets.

@nandor
Last active February 25, 2016 11:26
Show Gist options
  • Save nandor/cbc43cc57ffb448253df to your computer and use it in GitHub Desktop.
Save nandor/cbc43cc57ffb448253df to your computer and use it in GitHub Desktop.
SSE + AVX Challenge
// AVX example. Compile with:
// clang++ simd.cc -O3 -osimd -lopencv_core -lopencv_highgui -lopencv_imgproc -std=c++14 -mavx -mfma
#include <chrono>
#include <iostream>
#include <immintrin.h>
#include <opencv2/opencv.hpp>
// Alignment requirement.
static constexpr int ALIGN = 32;
static_assert((ALIGN & (ALIGN - 1)) == 0, "ALIGN must be a power of two.");
// Variance of the Gaussian distribution.
static constexpr float SIGMA = 0.2f;
/**
* Generates an element of a gaussian distribution.
*/
float gaussian(int i, int r) {
float x = i / r;
return exp(-(x * x) / (2 * SIGMA * SIGMA));
}
static const float G[5] = {
gaussian(-2, 2),
gaussian(-1, 2),
gaussian( 0, 2),
gaussian(+1, 2),
gaussian(+2, 2)
};
/**
* Rounds a number to a power of two.
*/
template<typename T>
T round(const T& n, const T& r) {
assert((r & (r - 1)) == 0);
if ((n & (r - 1)) == 0) {
// If number is already a multiple of the alignment, return.
return n;
} else {
// Otherwise, increase the number and round off the last bits.
return (n + r) & ~(r - 1);
}
}
/**
* Aligns a pointer to an address that is a multiple of a given number.
*/
template<typename T>
T *align(T* ptr, size_t N) {
union {
uintptr_t addr;
T *ptr;
} u;
u.ptr = ptr;
u.addr = round(u.addr, N);
return u.ptr;
}
/**
* Special image that aligns all rows to a 32 byte boundary, as required by
* VMOVAPS. It should be noted that the performance penalty from misaligned
* data here is not as significant as the penalty of using an unaligned 16
* byte load.
*/
class AlignedImage {
public:
/**
* Creates an empty image.
*
* The allocated buffer is 32 bytes larger than the requested buffer.
* This is done in order for us to be able to move to slide the start
* address of the image to the nearest address that is aligned to a 32 byte
* boundary, which happens to be at most 32 bytes apart.
*/
AlignedImage(int rows, int cols)
: rows_(rows)
, cols_(cols)
, stride_(round(cols * 4, ALIGN))
, data_(std::make_unique<uint8_t[]>(rows_ * stride_ + ALIGN))
, image_(static_cast<void*>(align(data_.get(), ALIGN)))
{
}
/**
* Creates the image by copying a CV matrix.
*/
explicit AlignedImage(const cv::Mat &mat)
: AlignedImage(mat.rows, mat.cols)
{
assert(mat.channels() == 1);
assert(mat.type() == CV_32FC1);
for (int i = 0; i < mat.rows; ++i) {
memcpy(this->operator[](i), mat.ptr<float>(i), mat.cols * sizeof(float));
}
}
/**
* Converts the image to an OpenCV matrix.
*/
operator cv::Mat () {
// Make sure to clone the data so the matrix can be used even if the
// image goes out of scope and the unique pointer is deallocated.
return cv::Mat(rows_, cols_, CV_32FC1, image_, stride_).clone();
}
float *operator[] (size_t n) {
assert(n < rows_);
return static_cast<float*>(image_) + n * (stride_ / sizeof(float));
}
const float *operator[] (size_t n) const {
return (const_cast<AlignedImage*>(this))->operator[](n);
}
// Accessors for properties.
int getRows() const { return rows_; }
int getCols() const { return cols_; }
private:
// Size of the image.
int rows_;
int cols_;
// The stride represents the length of a single row, multiple of 32 bytes.
int stride_;
// Raw, unaligned data.
std::unique_ptr<uint8_t[]> data_;
// 32-byte aligned offset in the buffer.
void *image_;
};
/**
* Applies the bilateral filter on an image.
*
* This implementation follows the one from kfusion:
* https://github.com/GerhardR/kfusion/blob/master/kfusion.cu#L133
*/
void bilateralFilterNaive(const AlignedImage &in, AlignedImage &out) {
assert(in.getRows() == out.getRows());
assert(in.getCols() == out.getCols());
for (int r = 2; r < in.getRows() - 2; ++r) {
for (int c = 2; c < in.getCols() - 2; ++c) {
float center = in[r][c];
float accum = 0.0f;
float weight = 0.0f;
for (int i = -2; i <= 2; ++i) {
for (int j = -2; j <= 2; ++j) {
float pix = in[r + i][c + j];
float d = pix - center;
float w = G[i + 2] * G[j + 2] * exp(-(d * d) / (2 * SIGMA * SIGMA));
accum += w * pix;
weight += w;
}
}
out[r][c] = accum / weight;
}
}
}
/**
* Applies an AVX optimized bilateral filter to an image.
*/
void bilateralFilterAVX(const AlignedImage &in, AlignedImage &out) {
assert(in.getRows() == out.getRows());
assert(in.getCols() == out.getCols());
for (int r = 2; r < in.getRows() - 2; ++r) {
for (int c = 2; c < in.getCols() - 2; ++c) {
out[r][c] = in[r][c];
}
}
}
/**
* Entry point of the application. Loads an image & smooths it.
*/
int main(int argc, char **argv) {
if (argc != 3) {
std::cerr << "Usage: simd [image] [avx|naive]" << std::endl;
return EXIT_FAILURE;
}
// Load the image using OpenCV and convert it to grayscale float.
cv::Mat gray;
{
const auto rgb = cv::imread(argv[1], cv::IMREAD_COLOR);
cv::cvtColor(rgb, gray, CV_BGR2GRAY);
gray.convertTo(gray, CV_32FC1);
gray = gray / 255.0f;
}
// Create the input and output images.
AlignedImage input(gray);
AlignedImage output(gray.rows, gray.cols);
// Run the bilateral filter.
{
const auto start = std::chrono::high_resolution_clock::now();
{
const std::string method(argv[2]);
if (method == "naive") {
bilateralFilterNaive(input, output);
} else if (method == "avx") {
bilateralFilterAVX(input, output);
} else {
std::cerr << "Invalid method." << std::endl;
return EXIT_FAILURE;
}
}
const auto end = std::chrono::high_resolution_clock::now();
std::cout
<< "Duration: "
<< (std::chrono::duration<double>(end - start)).count()
<< std::endl;
}
// Show the output image.
cv::imshow("input", (cv::Mat)input);
cv::imshow("output", (cv::Mat)output);
while (cv::waitKey(0) != 'q');
return EXIT_SUCCESS;
}
@egospodinova
Copy link

Add #include <memory> for std::unique_ptr<>

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment