nandor/simd.cc

## simd.cc
// AVX example. Compile with:
// clang++ simd.cc -O3 -osimd -lopencv_core -lopencv_highgui -lopencv_imgproc -std=c++14 -mavx -mfma

#include <chrono>
#include <iostream>

#include <immintrin.h>

#include <opencv2/opencv.hpp>


// Alignment requirement.
static constexpr int ALIGN = 32;
static_assert((ALIGN & (ALIGN - 1)) == 0, "ALIGN must be a power of two.");

// Variance of the Gaussian distribution.
static constexpr float SIGMA = 0.2f;


/**
 * Generates an element of a gaussian distribution.
 */
float gaussian(int i, int r) {
  float x = i / r;
  return exp(-(x * x) / (2 * SIGMA * SIGMA));
}

static const float G[5] = {
  gaussian(-2, 2),
  gaussian(-1, 2),
  gaussian( 0, 2),
  gaussian(+1, 2),
  gaussian(+2, 2)
};


/**
 * Rounds a number to a power of two.
 */
template<typename T>
T round(const T& n, const T& r) {
  assert((r & (r - 1)) == 0);

  if ((n & (r - 1)) == 0) {
    // If number is already a multiple of the alignment, return.
    return n;
  } else {
    // Otherwise, increase the number and round off the last bits.
    return (n + r) & ~(r - 1);
  }
}

/**
 * Aligns a pointer to an address that is a multiple of a given number.
 */
template<typename T>
T *align(T* ptr, size_t N) {
  union {
    uintptr_t addr;
    T *ptr;
  } u;
  u.ptr = ptr;
  u.addr = round(u.addr, N);
  return u.ptr;
}


/**
 * Special image that aligns all rows to a 32 byte boundary, as required by
 * VMOVAPS. It should be noted that the performance penalty from misaligned
 * data here is not as significant as the penalty of using an unaligned 16
 * byte load.
 */
class AlignedImage {
 public:
  /**
   * Creates an empty image.
   *
   * The allocated buffer is 32 bytes larger than the requested buffer.
   * This is done in order for us to be able to move to slide the start
   * address of the image to the nearest address that is aligned to a 32 byte
   * boundary, which happens to be at most 32 bytes apart.
   */
  AlignedImage(int rows, int cols)
    : rows_(rows)
    , cols_(cols)
    , stride_(round(cols * 4, ALIGN))
    , data_(std::make_unique<uint8_t[]>(rows_ * stride_ + ALIGN))
    , image_(static_cast<void*>(align(data_.get(), ALIGN)))
  {
  }

  /**
   * Creates the image by copying a CV matrix.
   */
  explicit AlignedImage(const cv::Mat &mat)
    : AlignedImage(mat.rows, mat.cols)
  {
    assert(mat.channels() == 1);
    assert(mat.type() == CV_32FC1);

    for (int i = 0; i < mat.rows; ++i) {
      memcpy(this->operator[](i), mat.ptr<float>(i), mat.cols * sizeof(float));
    }
  }

  /**
   * Converts the image to an OpenCV matrix.
   */
  operator cv::Mat () {
    // Make sure to clone the data so the matrix can be used even if the
    // image goes out of scope and the unique pointer is deallocated.
    return cv::Mat(rows_, cols_, CV_32FC1, image_, stride_).clone();
  }

  float *operator[] (size_t n) {
    assert(n < rows_);
    return static_cast<float*>(image_) + n * (stride_ / sizeof(float));
  }

  const float *operator[] (size_t n) const {
    return (const_cast<AlignedImage*>(this))->operator[](n);
  }


  // Accessors for properties.
  int getRows() const { return rows_; }
  int getCols() const { return cols_; }

 private:
  // Size of the image.
  int rows_;
  int cols_;

  // The stride represents the length of a single row, multiple of 32 bytes.
  int stride_;

  // Raw, unaligned data.
  std::unique_ptr<uint8_t[]> data_;

  // 32-byte aligned offset in the buffer.
  void *image_;
};


/**
 * Applies the bilateral filter on an image.
 *
 * This implementation follows the one from kfusion:
 *   https://github.com/GerhardR/kfusion/blob/master/kfusion.cu#L133
 */
void bilateralFilterNaive(const AlignedImage &in, AlignedImage &out) {
  assert(in.getRows() == out.getRows());
  assert(in.getCols() == out.getCols());

  for (int r = 2; r < in.getRows() - 2; ++r) {
    for (int c = 2; c < in.getCols() - 2; ++c) {
      float center = in[r][c];

      float accum = 0.0f;
      float weight = 0.0f;

      for (int i = -2; i <= 2; ++i) {
        for (int j = -2; j <= 2; ++j) {
          float pix = in[r + i][c + j];

          float d = pix - center;
          float w = G[i + 2] * G[j + 2] * exp(-(d * d) / (2 * SIGMA * SIGMA));

          accum += w * pix;
          weight += w;
        }
      }

      out[r][c] = accum / weight;
    }
  }
}


/**
 * Applies an AVX optimized bilateral filter to an image.
 */
void bilateralFilterAVX(const AlignedImage &in, AlignedImage &out) {
  assert(in.getRows() == out.getRows());
  assert(in.getCols() == out.getCols());

  for (int r = 2; r < in.getRows() - 2; ++r) {
    for (int c = 2; c < in.getCols() - 2; ++c) {
      out[r][c] = in[r][c];
    }
  }
}


/**
 * Entry point of the application. Loads an image & smooths it.
 */
int main(int argc, char **argv) {

  if (argc != 3) {
    std::cerr << "Usage: simd [image] [avx|naive]" << std::endl;
    return EXIT_FAILURE;
  }

  // Load the image using OpenCV and convert it to grayscale float.
  cv::Mat gray;
  {
    const auto rgb = cv::imread(argv[1], cv::IMREAD_COLOR);
    cv::cvtColor(rgb, gray, CV_BGR2GRAY);
    gray.convertTo(gray, CV_32FC1);
    gray = gray / 255.0f;
  }

  // Create the input and output images.
  AlignedImage input(gray);
  AlignedImage output(gray.rows, gray.cols);

  // Run the bilateral filter.
  {
    const auto start = std::chrono::high_resolution_clock::now();
    {
      const std::string method(argv[2]);
      if (method == "naive") {
        bilateralFilterNaive(input, output);
      } else if (method == "avx") {
        bilateralFilterAVX(input, output);
      } else {
        std::cerr << "Invalid method." << std::endl;
        return EXIT_FAILURE;
      }
    }
    const auto end = std::chrono::high_resolution_clock::now();

    std::cout
        << "Duration: "
        << (std::chrono::duration<double>(end - start)).count()
        << std::endl;
  }

  // Show the output image.
  cv::imshow("input", (cv::Mat)input);
  cv::imshow("output", (cv::Mat)output);
  while (cv::waitKey(0) != 'q');

  return EXIT_SUCCESS;
}
	// AVX example. Compile with:
	// clang++ simd.cc -O3 -osimd -lopencv_core -lopencv_highgui -lopencv_imgproc -std=c++14 -mavx -mfma

	#include <chrono>
	#include <iostream>

	#include <immintrin.h>

	#include <opencv2/opencv.hpp>


	// Alignment requirement.
	static constexpr int ALIGN = 32;
	static_assert((ALIGN & (ALIGN - 1)) == 0, "ALIGN must be a power of two.");

	// Variance of the Gaussian distribution.
	static constexpr float SIGMA = 0.2f;


	/**
	* Generates an element of a gaussian distribution.
	*/
	float gaussian(int i, int r) {
	float x = i / r;
	return exp(-(x * x) / (2 * SIGMA * SIGMA));
	}

	static const float G[5] = {
	gaussian(-2, 2),
	gaussian(-1, 2),
	gaussian( 0, 2),
	gaussian(+1, 2),
	gaussian(+2, 2)
	};


	/**
	* Rounds a number to a power of two.
	*/
	template<typename T>
	T round(const T& n, const T& r) {
	assert((r & (r - 1)) == 0);

	if ((n & (r - 1)) == 0) {
	// If number is already a multiple of the alignment, return.
	return n;
	} else {
	// Otherwise, increase the number and round off the last bits.
	return (n + r) & ~(r - 1);
	}
	}

	/**
	* Aligns a pointer to an address that is a multiple of a given number.
	*/
	template<typename T>
	T align(T ptr, size_t N) {
	union {
	uintptr_t addr;
	T *ptr;
	} u;
	u.ptr = ptr;
	u.addr = round(u.addr, N);
	return u.ptr;
	}


	/**
	* Special image that aligns all rows to a 32 byte boundary, as required by
	* VMOVAPS. It should be noted that the performance penalty from misaligned
	* data here is not as significant as the penalty of using an unaligned 16
	* byte load.
	*/
	class AlignedImage {
	public:
	/**
	* Creates an empty image.
	*
	* The allocated buffer is 32 bytes larger than the requested buffer.
	* This is done in order for us to be able to move to slide the start
	* address of the image to the nearest address that is aligned to a 32 byte
	* boundary, which happens to be at most 32 bytes apart.
	*/
	AlignedImage(int rows, int cols)
	: rows_(rows)
	, cols_(cols)
	, stride_(round(cols * 4, ALIGN))
	, data_(std::make_unique<uint8_t[]>(rows_ * stride_ + ALIGN))
	, image_(static_cast<void*>(align(data_.get(), ALIGN)))
	{
	}

	/**
	* Creates the image by copying a CV matrix.
	*/
	explicit AlignedImage(const cv::Mat &mat)
	: AlignedImage(mat.rows, mat.cols)
	{
	assert(mat.channels() == 1);
	assert(mat.type() == CV_32FC1);

	for (int i = 0; i < mat.rows; ++i) {
	memcpy(this->operator[](i), mat.ptr<float>(i), mat.cols * sizeof(float));
	}
	}

	/**
	* Converts the image to an OpenCV matrix.
	*/
	operator cv::Mat () {
	// Make sure to clone the data so the matrix can be used even if the
	// image goes out of scope and the unique pointer is deallocated.
	return cv::Mat(rows_, cols_, CV_32FC1, image_, stride_).clone();
	}

	float *operator[] (size_t n) {
	assert(n < rows_);
	return static_cast<float>(image_) + n (stride_ / sizeof(float));
	}

	const float *operator[] (size_t n) const {
	return (const_cast<AlignedImage*>(this))->operator[](n);
	}


	// Accessors for properties.
	int getRows() const { return rows_; }
	int getCols() const { return cols_; }

	private:
	// Size of the image.
	int rows_;
	int cols_;

	// The stride represents the length of a single row, multiple of 32 bytes.
	int stride_;

	// Raw, unaligned data.
	std::unique_ptr<uint8_t[]> data_;

	// 32-byte aligned offset in the buffer.
	void *image_;
	};


	/**
	* Applies the bilateral filter on an image.
	*
	* This implementation follows the one from kfusion:
	* https://github.com/GerhardR/kfusion/blob/master/kfusion.cu#L133
	*/
	void bilateralFilterNaive(const AlignedImage &in, AlignedImage &out) {
	assert(in.getRows() == out.getRows());
	assert(in.getCols() == out.getCols());

	for (int r = 2; r < in.getRows() - 2; ++r) {
	for (int c = 2; c < in.getCols() - 2; ++c) {
	float center = in[r][c];

	float accum = 0.0f;
	float weight = 0.0f;

	for (int i = -2; i <= 2; ++i) {
	for (int j = -2; j <= 2; ++j) {
	float pix = in[r + i][c + j];

	float d = pix - center;
	float w = G[i + 2] * G[j + 2] * exp(-(d * d) / (2 * SIGMA * SIGMA));

	accum += w * pix;
	weight += w;
	}
	}

	out[r][c] = accum / weight;
	}
	}
	}


	/**
	* Applies an AVX optimized bilateral filter to an image.
	*/
	void bilateralFilterAVX(const AlignedImage &in, AlignedImage &out) {
	assert(in.getRows() == out.getRows());
	assert(in.getCols() == out.getCols());

	for (int r = 2; r < in.getRows() - 2; ++r) {
	for (int c = 2; c < in.getCols() - 2; ++c) {
	out[r][c] = in[r][c];
	}
	}
	}


	/**
	* Entry point of the application. Loads an image & smooths it.
	*/
	int main(int argc, char **argv) {

	if (argc != 3) {
	std::cerr << "Usage: simd [image] [avx\|naive]" << std::endl;
	return EXIT_FAILURE;
	}

	// Load the image using OpenCV and convert it to grayscale float.
	cv::Mat gray;
	{
	const auto rgb = cv::imread(argv[1], cv::IMREAD_COLOR);
	cv::cvtColor(rgb, gray, CV_BGR2GRAY);
	gray.convertTo(gray, CV_32FC1);
	gray = gray / 255.0f;
	}

	// Create the input and output images.
	AlignedImage input(gray);
	AlignedImage output(gray.rows, gray.cols);

	// Run the bilateral filter.
	{
	const auto start = std::chrono::high_resolution_clock::now();
	{
	const std::string method(argv[2]);
	if (method == "naive") {
	bilateralFilterNaive(input, output);
	} else if (method == "avx") {
	bilateralFilterAVX(input, output);
	} else {
	std::cerr << "Invalid method." << std::endl;
	return EXIT_FAILURE;
	}
	}
	const auto end = std::chrono::high_resolution_clock::now();

	std::cout
	<< "Duration: "
	<< (std::chrono::duration<double>(end - start)).count()
	<< std::endl;
	}

	// Show the output image.
	cv::imshow("input", (cv::Mat)input);
	cv::imshow("output", (cv::Mat)output);
	while (cv::waitKey(0) != 'q');

	return EXIT_SUCCESS;
	}