Skip to content

Instantly share code, notes, and snippets.

@marty1885
Created January 8, 2020 02:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save marty1885/df0f40e6911a244ae9026d36538dc8ad to your computer and use it in GitHub Desktop.
Save marty1885/df0f40e6911a244ae9026d36538dc8ad to your computer and use it in GitHub Desktop.
#include <vector>
#include <algorithm>
#include <chrono>
#include <random>
#include <cstdint>
#include <cassert>
// Banchmarking library
#include <hayai/hayai.hpp>
// Testing parameters
constexpr int width = 512, height = 512;
constexpr int local_area = 32;
constexpr int stride = 1;
constexpr float local_density = 0.1;
constexpr int out_width = (width-local_area)/stride+1;
constexpr int out_height = (height-local_area)/stride+1;
std::vector<int> activation;
std::vector<uint8_t> output;
std::vector<std::pair<int, int>> local_values;
BENCHMARK(local_inhib, local_inhib_base, 1, 1)
{
int num_active_bits = local_area*local_area*local_density;
// The baseline algorithm works like max pooling, a naive version
// Performance on my box is ~2.4 iterations/s
for(int y=0;y<height-local_area;y+=stride) {
for(int x=0;x<width-local_area;x+=stride) {
// Collect the nearby cell index and activations
for(int dy=0;dy<local_area;dy++) {
for(int dx=0;dx<local_area;dx++) {
int index = (y+dy)*width+x+dx;
local_values[dy*local_area+dx] = {index, activation[index]};
}
}
int output_index = (y/stride)*out_width+(x/stride);
// Do a partial sort to put the top K values in the top K slots
std::nth_element(local_values.begin(), local_values.begin()+num_active_bits, local_values.end()
, [](const auto& a, const auto& b) {return a.second > b.second;});
// If the current cell is in the top K slots
if(std::find_if(local_values.begin(), local_values.begin()+num_active_bits
, [&](const auto& a){ return a.first == (y*width+x); }) != local_values.begin()+num_active_bits)
output[output_index] = 1;
else
output[output_index] = 0;
}
}
}
// Optimization 3: Instrad of storing both index and activation. Since we are just dealing with current
// cell values. We don't need index anymore. But this only provide noise level
// improvments.
std::vector<int> local_activations;
BENCHMARK(local_inhib, local_inhib_optim, 1, 1)
{
// Performance on my box is ~45.3 iterations/s
int num_active_bits = local_area*local_area*local_density;
for(int x=0;x<width-local_area;x+=stride) {
for(int y=0;y<height-local_area;y+=stride) {
int target_value = 0;
// Optimization 2: Reuse previouslly aquired vaules
int n = 0;
if(y == 0) { // If y == 0. i.e. First time in the column
target_value = activation[y*width+x];
for(int dy=0;dy<local_area;dy++) {
for(int dx=0;dx<local_area;dx++) {
int index = (y+dy)*width+x+dx;
local_activations[dy*local_area+dx] = activation[index];
// Optimization 5: Instrad of computing how many cells are more active then out current one
// after done with local_activations. We compute it on the fly. Reducing memory
// access.
if(activation[index] > target_value)
n += 1;
}
}
}
else { // Reuse old data when possible
int dy = (y-1)%local_area;
// Optimization 4: For some reason using std::copy is faster then a for loop
std::copy(
activation.begin() + (y+dy)*width+x,
activation.begin() + (y+dy)*width+x+local_area,
local_activations.begin() + dy*local_area
);
target_value = local_activations[dy*local_area];
// Optimization 1: Instead of sorting and finding if the current cell is in the top-K cells
// we count how many cells has greater activations comapred to the current one.
n = std::count_if(local_activations.begin(), local_activations.end(), [&](int act){ return act > target_value; });
}
int output_index = (y/stride)*out_width+(x/stride);
if(n < num_active_bits)
output[output_index] = 1;
else
output[output_index] = 0;
}
}
}
int main()
{
//Initalize activation
activation = std::vector<int>(width*height);
std::mt19937 rng;
std::uniform_int_distribution<int> dist(0, 16);
std::generate(activation.begin(), activation.end(), [&](){ return bool(dist(rng)); });
// Initalize buffers
output = std::vector<uint8_t>(out_width*out_height);
local_values.resize(local_area*local_area);
local_activations.resize(local_area*local_area);
hayai::ConsoleOutputter consoleOutputter;
hayai::Benchmarker::AddOutputter(consoleOutputter);
hayai::Benchmarker::RunAllTests();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment