Skip to content

Instantly share code, notes, and snippets.

@neoblizz
Last active May 4, 2021
Embed
What would you like to do?
CUDA-based implementation to introduce sparsity.
#include <stdio.h>
#include <stdlib.h>
#include <ctime>
#include <random>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/transform.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/discard_iterator.h>
float get_random() {
static std::default_random_engine e;
static std::uniform_real_distribution<> dis(0, 1); // rage 0 - 1
return dis(e);
}
int main(int argc, char** argv) {
using weight_t = float;
// Weight Matrix (m x n)
constexpr std::size_t m = 32;
constexpr std::size_t n = 32;
thrust::host_vector<weight_t> h_weights(n * m);
// Generate random weights
srand((unsigned)time(0));
for (auto& weight : h_weights)
weight = get_random();
// Move the data to GPU
thrust::device_vector<weight_t> d_weights = h_weights;
auto weights = d_weights.data().get(); // pointer to data
// Block configuration (2 x 2)
constexpr std::size_t blk_m = 2;
constexpr std::size_t blk_n = 2;
constexpr std::size_t blk_size = blk_m * blk_n;
// Tile configuration (Number of blocks per m, n)
constexpr std::size_t tile_m = m / blk_m;
constexpr std::size_t tile_n = n / blk_n;
// Sparsify lambda (50%)
float sparsity_factor = 0.5;
std::size_t number_of_zeros_per_block = floor(blk_size * sparsity_factor);
auto sparsify = [=] __device__(std::size_t const& blk_idx) {
// Global idx strided by blk_idx
auto global_idx = blk_idx * blk_size;
std::size_t sparsified = 0;
// Block idx as (m, n)
// auto blk_m_idx = blk_idx % blk_n;
// auto blk_n_idx = blk_idx / blk_n;
// Loop over the (2 x 2) block
for (std::size_t h = 0; h < blk_m; ++h) {
for (std::size_t w = 0; w < blk_n; ++w) {
if (sparsified == number_of_zeros_per_block)
break;
// <todo> need a good condition to determine
// if a value should be sparsified.
auto idx = global_idx + h + (w * blk_n);
weights[idx] = (weight_t)0;
sparsified++;
}
}
return 0;
};
// Kernel launch using transform
cudaStream_t stream = 0;
thrust::transform(
thrust::cuda::par.on(stream), // CUDA stream
thrust::make_counting_iterator<std::size_t>(0), // Begin iterator: 0
thrust::make_counting_iterator<std::size_t>(
tile_m * tile_n), // End iterator: tile_m * tile_n
thrust::make_discard_iterator(), // Discard output
sparsify // Unary Operator
);
// Log and output
std::cout << "Matrix Size (m, n) = (" << m << ", " << n << ")" << std::endl;
std::cout << "Sparsity Factor = " << sparsity_factor * 100 << "%"
<< std::endl;
std::cout << "Number of Nonzeros Per Block = " << number_of_zeros_per_block
<< std::endl;
std::cout << "Block Size blk_(m, n) = (" << blk_m << ", " << blk_n << ")"
<< std::endl;
std::cout << "Number of Blocks = " << blk_size << std::endl;
std::cout << "Weights (sparsified) = " << std::endl;
thrust::copy(d_weights.begin(), d_weights.end(),
std::ostream_iterator<weight_t>(std::cout, " "));
std::cout << std::endl;
}
@neoblizz
Copy link
Author

neoblizz commented Mar 12, 2021

Compile and run;

nvcc -o sparsify sparsify.cu --extended-lambda -O3 --use_fast_math
./sparsify

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment