jasorod/blue_noise_gen.cu

## blue_noise_gen.cu
/*
 * Blue noise dithered sampling algorithm based on the paper "Blue-noise Dithered Sampling" by Georgiev and Fajardo
 * Uses simulated anealing to calculate a blue noise image from a random set of values
 *
 * Program will output a PGM file for the first dimension of the random noise image and the blue noise image
 *
 * compile: nvcc -O3 --std=c++14 -o blue_noise_gen blue_noise_gen.cu
 *
 */

#include <cuda_runtime.h>
#include <math.h>
#include <cstring>
#include <fstream>
#include <iostream>
#include <random>
#include <limits>

/*
 * adjust these values depending on the size of the image and the dimensions of the vector you want, the
 * threshold for when the algorithm should stop, and the bit-depth of the output PGM
 */

constexpr int image_width = 256;
constexpr int image_height = 256;
constexpr float variance = 2.1f;
constexpr int kernel_range = 5;
constexpr int iteration_miss_threshold = 5000000;
constexpr unsigned int dimensions = 1;
constexpr int max_output_value = std::numeric_limits<unsigned char>::max();

//leave these values constant
constexpr int cuda_block_size = 32;
constexpr float epsilon = 1e-5;

template<unsigned int dimesion = 1>
struct FloatVectorType {
    static constexpr unsigned int dim = dimension;
    float data[dimension];
} __attribute__ ((aligned(4)));

bool temp_probability(float temp, float prev_state, float current_state, float probability) {
    return std::exp(-1 * (current_state - prev_state) / temp) > probability && std::fabs(prev_state - current_state) > epsilon;
}

template<typename T>
__inline__ __device__ T warp_reduce_sum(T val) {
    for (int offset = warpSize/2; offset > 0; offset /= 2) {
        val += __shfl_down_sync(0xffffffff, val, offset);
    }

    return val;
}

template<typename T>
__global__ void reduce_sum(T *input_array, T* out_value, int N) {
    T sum = T(0);
    int index = blockIdx.x * blockDim.x + threadIdx.x;

    if (index < N) {
        sum = warp_reduce_sum(input_array[index]);
    }

    if ((threadIdx.x & (warpSize - 1)) == 0) {
        atomicAdd(out_value, sum);
    }
}

template<typename VectorType>
__device__ float norm_diff(VectorType const& a, VectorType const& b) {
    float diff[VectorType::dim];

    for (int i = 0; i < VectorType::dim; i++) {
        diff[i] = b.data[i] - a.data[i];
    }

    return normf(VectorType::dim, diff);
}

template<typename VectorType>
__global__ void energy_function(VectorType const *input_array, float *energy_array, int width, int height, int kernel_range, float variance) {
    const int col = threadIdx.x + blockIdx.x * blockDim.x;
    const int row = threadIdx.y + blockIdx.y * blockDim.y;

    if (col < width && row < height) {
        VectorType input_source_sample = input_array[col + row * width];
        energy_array[col + row * width] = 0.0f;

        for (int i = 1 - kernel_range; i < kernel_range; i++) {
            for (int j = 1 - kernel_range; j < kernel_range; j++) {
                int input_index_x = (width + ((col + j) % width)) % width;
                int input_index_y = (height + ((row + i) % height)) % height;
                VectorType input_kernel_sample = input_array[input_index_x + input_index_y * width];

                float pixel_gaussian = (powf(static_cast<float>(col + j) - static_cast<float>(col), 2.0f) + powf(static_cast<float>(row + i) - static_cast<float>(row), 2.0f)) / powf(variance, 2.0f);
                float vector_gaussian = powf(norm_diff(input_source_sample, input_kernel_sample), static_cast<float>(VectorType::dim) / 2.0f);
                energy_array[col + row * width] += expf(-1 * (pixel_gaussian + vector_gaussian));
            }
        }
    }
}

template<typename T>
void swap_pixels(T *image_array, int2 location_a, int2 location_b) {
    T temp = image_array[location_a.x + location_a.y * image_width];
    image_array[location_a.x + location_a.y * image_width] = image_array[location_b.x + location_b.y * image_width];
    image_array[location_b.x + location_b.y * image_width] = temp;
}

template<typename VectorType>
void output_pgm(std::string filename, int channel, VectorType const *data_array) {
    std::ofstream pgm(filename.c_str());
    pgm << "P2\n" << image_width << " " << image_height << "\n" << max_output_value << "\n";

    for (int i = 0; i < image_height; i++) {
        for (int j = 0; j < image_width; j++) {
            pgm << static_cast<unsigned int>(data_array[j + i * image_width].data[channel] * max_output_value) << "\n";
        }
    }

    pgm.close();
}

int main(int argc, char** argv) {
    std::random_device rd;
    std::mt19937 generator(rd());
    std::uniform_int_distribution<> rand_int_width_dist(0, image_width - 1);
    std::uniform_int_distribution<> rand_int_height_dist(0, image_height - 1);
    std::uniform_real_distribution<> rand_float_dist(0.0, 1.0);

    FloatVectorType<dimensions> *source_array;
    FloatVectorType<dimensions> *solution_array;

    cudaMallocManaged(&source_array, sizeof(FloatVectorType<dimensions>) * image_width * image_height);
    solution_array = new FloatVectorType<dimensions>[image_width * image_height];

    for (int i = 0; i < image_height; i++){
        for (int j = 0; j < image_width; j++) {
            source_array[j + i * image_width] = FloatVectorType<dimensions>{static_cast<float>(rand_float_dist(generator))};
        }
    }

    std::cerr << "Outputting initial random starting point" << std::endl;
    output_pgm("random_noise.pgm", 0, source_array);

    float current_energy = std::numeric_limits<float>::max();
    float *energy_array = nullptr;
    float *new_energy_value = nullptr;

    cudaMallocManaged(&energy_array, sizeof(float) * image_width * image_height);
    cudaMallocManaged(&new_energy_value, sizeof(float));

    float temperature = 1.0f;
    int2 swap_a_loc;
    int2 swap_b_loc;
    int iteration = 0;
    int iteration_miss = 0;

    dim3 grid_size_2d((image_width + cuda_block_size) / cuda_block_size, (image_height + cuda_block_size) / cuda_block_size);
    dim3 block_size_2d(cuda_block_size, cuda_block_size);

    dim3 grid_size_1d((image_width * image_height + cuda_block_size) / cuda_block_size);
    dim3 block_size_1d(cuda_block_size);

    //iterate until we've gotten to the point where we're no longer getting results
    while (iteration_miss < iteration_miss_threshold) {
        //generate random swap indices
        swap_a_loc.x = rand_int_width_dist(generator);
        swap_a_loc.y = rand_int_height_dist(generator);
        swap_b_loc.x = rand_int_width_dist(generator);
        swap_b_loc.y = rand_int_height_dist(generator);

        //swap two random pixel pairs in the source array
        swap_pixels(source_array, swap_a_loc, swap_b_loc);
        *new_energy_value = 0.0f;

        energy_function<<<grid_size_2d, block_size_2d>>>(source_array, energy_array, image_width, image_height, kernel_range, variance);
        reduce_sum<<<grid_size_1d, block_size_1d>>>(energy_array, new_energy_value, image_width * image_height);

        cudaDeviceSynchronize();

        //if the new energy less than the current energy or temp heuristic is true, assign the current source array as the new solution array, otherwise undo the pixel swap and start again
        if (*new_energy_value < current_energy || temp_probability(temperature, current_energy, *new_energy_value, rand_float_dist(generator))) {
            std::cerr << "[Run #" << iteration << "] found new energy result: " << *new_energy_value << std::endl;
            current_energy = *new_energy_value;
            cudaMemcpy(solution_array, source_array, sizeof(FloatVectorType<dimensions>) * image_width * image_height, cudaMemcpyDefault);
            iteration_miss = 0;
        } else {
            swap_pixels(source_array, swap_a_loc, swap_b_loc);
            iteration_miss++;
        }

        temperature *= 0.995f;
        iteration++;
    }

    std::cerr << "[Run #" << iteration << "] finished simulation, outputting PGM file" << std::endl;
    output_pgm("blue_noise.pgm", 0, solution_array);
}
	/*
	* Blue noise dithered sampling algorithm based on the paper "Blue-noise Dithered Sampling" by Georgiev and Fajardo
	* Uses simulated anealing to calculate a blue noise image from a random set of values
	*
	* Program will output a PGM file for the first dimension of the random noise image and the blue noise image
	*
	* compile: nvcc -O3 --std=c++14 -o blue_noise_gen blue_noise_gen.cu
	*
	*/

	#include <cuda_runtime.h>
	#include <math.h>
	#include <cstring>
	#include <fstream>
	#include <iostream>
	#include <random>
	#include <limits>

	/*
	* adjust these values depending on the size of the image and the dimensions of the vector you want, the
	* threshold for when the algorithm should stop, and the bit-depth of the output PGM
	*/

	constexpr int image_width = 256;
	constexpr int image_height = 256;
	constexpr float variance = 2.1f;
	constexpr int kernel_range = 5;
	constexpr int iteration_miss_threshold = 5000000;
	constexpr unsigned int dimensions = 1;
	constexpr int max_output_value = std::numeric_limits<unsigned char>::max();

	//leave these values constant
	constexpr int cuda_block_size = 32;
	constexpr float epsilon = 1e-5;

	template<unsigned int dimesion = 1>
	struct FloatVectorType {
	static constexpr unsigned int dim = dimension;
	float data[dimension];
	} __attribute__ ((aligned(4)));

	bool temp_probability(float temp, float prev_state, float current_state, float probability) {
	return std::exp(-1 * (current_state - prev_state) / temp) > probability && std::fabs(prev_state - current_state) > epsilon;
	}

	template<typename T>
	__inline__ __device__ T warp_reduce_sum(T val) {
	for (int offset = warpSize/2; offset > 0; offset /= 2) {
	val += __shfl_down_sync(0xffffffff, val, offset);
	}

	return val;
	}

	template<typename T>
	__global__ void reduce_sum(T input_array, T out_value, int N) {
	T sum = T(0);
	int index = blockIdx.x * blockDim.x + threadIdx.x;

	if (index < N) {
	sum = warp_reduce_sum(input_array[index]);
	}

	if ((threadIdx.x & (warpSize - 1)) == 0) {
	atomicAdd(out_value, sum);
	}
	}

	template<typename VectorType>
	__device__ float norm_diff(VectorType const& a, VectorType const& b) {
	float diff[VectorType::dim];

	for (int i = 0; i < VectorType::dim; i++) {
	diff[i] = b.data[i] - a.data[i];
	}

	return normf(VectorType::dim, diff);
	}

	template<typename VectorType>
	__global__ void energy_function(VectorType const input_array, float energy_array, int width, int height, int kernel_range, float variance) {
	const int col = threadIdx.x + blockIdx.x * blockDim.x;
	const int row = threadIdx.y + blockIdx.y * blockDim.y;

	if (col < width && row < height) {
	VectorType input_source_sample = input_array[col + row * width];
	energy_array[col + row * width] = 0.0f;

	for (int i = 1 - kernel_range; i < kernel_range; i++) {
	for (int j = 1 - kernel_range; j < kernel_range; j++) {
	int input_index_x = (width + ((col + j) % width)) % width;
	int input_index_y = (height + ((row + i) % height)) % height;
	VectorType input_kernel_sample = input_array[input_index_x + input_index_y * width];

	float pixel_gaussian = (powf(static_cast<float>(col + j) - static_cast<float>(col), 2.0f) + powf(static_cast<float>(row + i) - static_cast<float>(row), 2.0f)) / powf(variance, 2.0f);
	float vector_gaussian = powf(norm_diff(input_source_sample, input_kernel_sample), static_cast<float>(VectorType::dim) / 2.0f);
	energy_array[col + row * width] += expf(-1 * (pixel_gaussian + vector_gaussian));
	}
	}
	}
	}

	template<typename T>
	void swap_pixels(T *image_array, int2 location_a, int2 location_b) {
	T temp = image_array[location_a.x + location_a.y * image_width];
	image_array[location_a.x + location_a.y * image_width] = image_array[location_b.x + location_b.y * image_width];
	image_array[location_b.x + location_b.y * image_width] = temp;
	}

	template<typename VectorType>
	void output_pgm(std::string filename, int channel, VectorType const *data_array) {
	std::ofstream pgm(filename.c_str());
	pgm << "P2\n" << image_width << " " << image_height << "\n" << max_output_value << "\n";

	for (int i = 0; i < image_height; i++) {
	for (int j = 0; j < image_width; j++) {
	pgm << static_cast<unsigned int>(data_array[j + i * image_width].data[channel] * max_output_value) << "\n";
	}
	}

	pgm.close();
	}

	int main(int argc, char** argv) {
	std::random_device rd;
	std::mt19937 generator(rd());
	std::uniform_int_distribution<> rand_int_width_dist(0, image_width - 1);
	std::uniform_int_distribution<> rand_int_height_dist(0, image_height - 1);
	std::uniform_real_distribution<> rand_float_dist(0.0, 1.0);

	FloatVectorType<dimensions> *source_array;
	FloatVectorType<dimensions> *solution_array;

	cudaMallocManaged(&source_array, sizeof(FloatVectorType<dimensions>) * image_width * image_height);
	solution_array = new FloatVectorType<dimensions>[image_width * image_height];

	for (int i = 0; i < image_height; i++){
	for (int j = 0; j < image_width; j++) {
	source_array[j + i * image_width] = FloatVectorType<dimensions>{static_cast<float>(rand_float_dist(generator))};
	}
	}

	std::cerr << "Outputting initial random starting point" << std::endl;
	output_pgm("random_noise.pgm", 0, source_array);

	float current_energy = std::numeric_limits<float>::max();
	float *energy_array = nullptr;
	float *new_energy_value = nullptr;

	cudaMallocManaged(&energy_array, sizeof(float) * image_width * image_height);
	cudaMallocManaged(&new_energy_value, sizeof(float));

	float temperature = 1.0f;
	int2 swap_a_loc;
	int2 swap_b_loc;
	int iteration = 0;
	int iteration_miss = 0;

	dim3 grid_size_2d((image_width + cuda_block_size) / cuda_block_size, (image_height + cuda_block_size) / cuda_block_size);
	dim3 block_size_2d(cuda_block_size, cuda_block_size);

	dim3 grid_size_1d((image_width * image_height + cuda_block_size) / cuda_block_size);
	dim3 block_size_1d(cuda_block_size);

	//iterate until we've gotten to the point where we're no longer getting results
	while (iteration_miss < iteration_miss_threshold) {
	//generate random swap indices
	swap_a_loc.x = rand_int_width_dist(generator);
	swap_a_loc.y = rand_int_height_dist(generator);
	swap_b_loc.x = rand_int_width_dist(generator);
	swap_b_loc.y = rand_int_height_dist(generator);

	//swap two random pixel pairs in the source array
	swap_pixels(source_array, swap_a_loc, swap_b_loc);
	*new_energy_value = 0.0f;

	energy_function<<<grid_size_2d, block_size_2d>>>(source_array, energy_array, image_width, image_height, kernel_range, variance);
	reduce_sum<<<grid_size_1d, block_size_1d>>>(energy_array, new_energy_value, image_width * image_height);

	cudaDeviceSynchronize();

	//if the new energy less than the current energy or temp heuristic is true, assign the current source array as the new solution array, otherwise undo the pixel swap and start again
	if (new_energy_value < current_energy \|\| temp_probability(temperature, current_energy, new_energy_value, rand_float_dist(generator))) {
	std::cerr << "[Run #" << iteration << "] found new energy result: " << *new_energy_value << std::endl;
	current_energy = *new_energy_value;
	cudaMemcpy(solution_array, source_array, sizeof(FloatVectorType<dimensions>) * image_width * image_height, cudaMemcpyDefault);
	iteration_miss = 0;
	} else {
	swap_pixels(source_array, swap_a_loc, swap_b_loc);
	iteration_miss++;
	}

	temperature *= 0.995f;
	iteration++;
	}

	std::cerr << "[Run #" << iteration << "] finished simulation, outputting PGM file" << std::endl;
	output_pgm("blue_noise.pgm", 0, solution_array);
	}