neoblizz/sparsify.cu

## sparsify.cu
#include <stdio.h>
#include <stdlib.h>
#include <ctime>
#include <random>

#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/transform.h>

#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/discard_iterator.h>

float get_random() {
  static std::default_random_engine e;
  static std::uniform_real_distribution<> dis(0, 1);  // rage 0 - 1
  return dis(e);
}

int main(int argc, char** argv) {
  using weight_t = float;

  // Weight Matrix (m x n)
  constexpr std::size_t m = 32;
  constexpr std::size_t n = 32;

  thrust::host_vector<weight_t> h_weights(n * m);

  // Generate random weights
  srand((unsigned)time(0));
  for (auto& weight : h_weights)
    weight = get_random();

  // Move the data to GPU
  thrust::device_vector<weight_t> d_weights = h_weights;
  auto weights = d_weights.data().get();  // pointer to data

  // Block configuration (2 x 2)
  constexpr std::size_t blk_m = 2;
  constexpr std::size_t blk_n = 2;
  constexpr std::size_t blk_size = blk_m * blk_n;

  // Tile configuration (Number of blocks per m, n)
  constexpr std::size_t tile_m = m / blk_m;
  constexpr std::size_t tile_n = n / blk_n;

  // Sparsify lambda (50%)
  float sparsity_factor = 0.5;
  std::size_t number_of_zeros_per_block = floor(blk_size * sparsity_factor);

  auto sparsify = [=] __device__(std::size_t const& blk_idx) {
    // Global idx strided by blk_idx
    auto global_idx = blk_idx * blk_size;
    std::size_t sparsified = 0;

    // Block idx as (m, n)
    // auto blk_m_idx = blk_idx % blk_n;
    // auto blk_n_idx = blk_idx / blk_n;

    // Loop over the (2 x 2) block
    for (std::size_t h = 0; h < blk_m; ++h) {
      for (std::size_t w = 0; w < blk_n; ++w) {
        if (sparsified == number_of_zeros_per_block)
          break;

        // <todo> need a good condition to determine
        // if a value should be sparsified.
        auto idx = global_idx + h + (w * blk_n);
        weights[idx] = (weight_t)0;
        sparsified++;
      }
    }

    return 0;
  };

  // Kernel launch using transform
  cudaStream_t stream = 0;
  thrust::transform(
      thrust::cuda::par.on(stream),                    // CUDA stream
      thrust::make_counting_iterator<std::size_t>(0),  // Begin iterator: 0
      thrust::make_counting_iterator<std::size_t>(
          tile_m * tile_n),             // End iterator: tile_m * tile_n
      thrust::make_discard_iterator(),  // Discard output
      sparsify                          // Unary Operator
  );

  // Log and output
  std::cout << "Matrix Size (m, n) = (" << m << ", " << n << ")" << std::endl;
  std::cout << "Sparsity Factor = " << sparsity_factor * 100 << "%"
            << std::endl;
  std::cout << "Number of Nonzeros Per Block = " << number_of_zeros_per_block
            << std::endl;
  std::cout << "Block Size blk_(m, n) = (" << blk_m << ", " << blk_n << ")"
            << std::endl;
  std::cout << "Number of Blocks = " << blk_size << std::endl;
  std::cout << "Weights (sparsified) = " << std::endl;
  thrust::copy(d_weights.begin(), d_weights.end(),
               std::ostream_iterator<weight_t>(std::cout, " "));
  std::cout << std::endl;
}
	#include <stdio.h>
	#include <stdlib.h>
	#include <ctime>
	#include <random>

	#include <thrust/device_vector.h>
	#include <thrust/host_vector.h>
	#include <thrust/transform.h>

	#include <thrust/iterator/counting_iterator.h>
	#include <thrust/iterator/discard_iterator.h>

	float get_random() {
	static std::default_random_engine e;
	static std::uniform_real_distribution<> dis(0, 1); // rage 0 - 1
	return dis(e);
	}

	int main(int argc, char** argv) {
	using weight_t = float;

	// Weight Matrix (m x n)
	constexpr std::size_t m = 32;
	constexpr std::size_t n = 32;

	thrust::host_vector<weight_t> h_weights(n * m);

	// Generate random weights
	srand((unsigned)time(0));
	for (auto& weight : h_weights)
	weight = get_random();

	// Move the data to GPU
	thrust::device_vector<weight_t> d_weights = h_weights;
	auto weights = d_weights.data().get(); // pointer to data

	// Block configuration (2 x 2)
	constexpr std::size_t blk_m = 2;
	constexpr std::size_t blk_n = 2;
	constexpr std::size_t blk_size = blk_m * blk_n;

	// Tile configuration (Number of blocks per m, n)
	constexpr std::size_t tile_m = m / blk_m;
	constexpr std::size_t tile_n = n / blk_n;

	// Sparsify lambda (50%)
	float sparsity_factor = 0.5;
	std::size_t number_of_zeros_per_block = floor(blk_size * sparsity_factor);

	auto sparsify = [=] __device__(std::size_t const& blk_idx) {
	// Global idx strided by blk_idx
	auto global_idx = blk_idx * blk_size;
	std::size_t sparsified = 0;

	// Block idx as (m, n)
	// auto blk_m_idx = blk_idx % blk_n;
	// auto blk_n_idx = blk_idx / blk_n;

	// Loop over the (2 x 2) block
	for (std::size_t h = 0; h < blk_m; ++h) {
	for (std::size_t w = 0; w < blk_n; ++w) {
	if (sparsified == number_of_zeros_per_block)
	break;

	// <todo> need a good condition to determine
	// if a value should be sparsified.
	auto idx = global_idx + h + (w * blk_n);
	weights[idx] = (weight_t)0;
	sparsified++;
	}
	}

	return 0;
	};

	// Kernel launch using transform
	cudaStream_t stream = 0;
	thrust::transform(
	thrust::cuda::par.on(stream), // CUDA stream
	thrust::make_counting_iterator<std::size_t>(0), // Begin iterator: 0
	thrust::make_counting_iterator<std::size_t>(
	tile_m * tile_n), // End iterator: tile_m * tile_n
	thrust::make_discard_iterator(), // Discard output
	sparsify // Unary Operator
	);

	// Log and output
	std::cout << "Matrix Size (m, n) = (" << m << ", " << n << ")" << std::endl;
	std::cout << "Sparsity Factor = " << sparsity_factor * 100 << "%"
	<< std::endl;
	std::cout << "Number of Nonzeros Per Block = " << number_of_zeros_per_block
	<< std::endl;
	std::cout << "Block Size blk_(m, n) = (" << blk_m << ", " << blk_n << ")"
	<< std::endl;
	std::cout << "Number of Blocks = " << blk_size << std::endl;
	std::cout << "Weights (sparsified) = " << std::endl;
	thrust::copy(d_weights.begin(), d_weights.end(),
	std::ostream_iterator<weight_t>(std::cout, " "));
	std::cout << std::endl;
	}