Skip to content

Instantly share code, notes, and snippets.

@Mochimazui
Created May 31, 2016 09:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Mochimazui/007b243e7be16527f5d5380d596c1993 to your computer and use it in GitHub Desktop.
Save Mochimazui/007b243e7be16527f5d5380d596c1993 to your computer and use it in GitHub Desktop.
#include <thrust/system/cuda/vector.h>
#include <thrust/system/cuda/execution_policy.h>
#include <thrust/host_vector.h>
#include <thrust/generate.h>
#include <thrust/sort.h>
#include <thrust/scan.h>
#include <thrust/pair.h>
#include <cstdlib>
#include <iostream>
#include <map>
#include <cassert>
// This example demonstrates how to intercept calls to get_temporary_buffer
// and return_temporary_buffer to control how Thrust allocates temporary storage
// during algorithms such as thrust::sort. The idea will be to create a simple
// cache of allocations to search when temporary storage is requested. If a hit
// is found in the cache, we quickly return the cached allocation instead of
// resorting to the more expensive thrust::cuda::malloc.
//
// Note: this implementation of cached_allocator is not thread-safe. If multiple
// (host) threads use the same cached_allocator, then each thread should gain
// exclusive access to the allocator before calling any of its methods.
// cached_allocator: a simple allocator for caching allocation requests.
//
// Blocks handed out by allocate() are tracked in `allocated_blocks`. When a
// block is returned through deallocate() it is not released; it is moved to
// `free_blocks`, keyed by its size, so that a later request for exactly the
// same number of bytes can reuse it. Every block still known to the allocator
// (free or outstanding) is released with thrust::cuda::free in the destructor.
//
// The allocator owns the memory it caches, so it is deliberately
// non-copyable: a copy would free the same blocks a second time. Pass it by
// reference (e.g. thrust::cuda::par(alloc) with a non-const lvalue).
class cached_allocator
{
public:
    // just allocate bytes
    typedef char value_type;

    cached_allocator() {}

    // Frees every block this allocator still tracks.
    ~cached_allocator()
    {
        free_all();
    }

    // Returns a block of exactly `num_bytes` bytes.
    //
    // A cached free block of the same size is reused when one exists;
    // otherwise a new block is requested from thrust::cuda::malloc. Any
    // exception raised by that call simply propagates to the caller.
    char *allocate(std::ptrdiff_t num_bytes)
    {
        char *result = 0;

        // search the cache for a free block of exactly this size
        free_blocks_type::iterator free_block = free_blocks.find(num_bytes);
        if (free_block != free_blocks.end())
        {
            std::cout << "cached_allocator::allocate(): found a hit" << std::endl;
            result = free_block->second;

            // Record the block as outstanding *before* removing it from the
            // free list, so a throwing insert cannot drop it from both maps.
            allocated_blocks.insert(std::make_pair(result, num_bytes));
            free_blocks.erase(free_block);
        }
        else
        {
            // no cached block of the right size exists; create a new one
            std::cout << "cached_allocator::allocate(): no free block found; calling cuda::malloc" << std::endl;

            // allocate memory and convert cuda::pointer to raw pointer
            result = thrust::cuda::malloc<char>(num_bytes).get();
            allocated_blocks.insert(std::make_pair(result, num_bytes));
        }

        return result;
    }

    // Returns `ptr` to the cache so a later allocate() of the same size can
    // reuse it. The size is taken from the allocator's own bookkeeping, so
    // `n` is ignored. A pointer that is not currently tracked as allocated
    // (never handed out by this allocator, or already deallocated) is
    // reported on std::cerr and otherwise ignored.
    void deallocate(char *ptr, size_t n)
    {
        (void)n; // size is looked up in allocated_blocks instead

        allocated_blocks_type::iterator iter = allocated_blocks.find(ptr);
        if (iter == allocated_blocks.end())
        {
            // Dereferencing end() would be undefined behaviour; refuse instead.
            std::cerr << "cached_allocator::deallocate(): pointer was not allocated by this allocator; ignoring" << std::endl;
            return;
        }

        // Insert into the free list first so the block stays tracked if the
        // insert throws, then drop it from the outstanding map.
        free_blocks.insert(std::make_pair(iter->second, ptr));
        allocated_blocks.erase(iter);
    }

private:
    // size in bytes -> cached free block (several blocks may share a size)
    typedef std::multimap<std::ptrdiff_t, char *> free_blocks_type;
    // outstanding block -> size in bytes
    typedef std::map<char *, std::ptrdiff_t> allocated_blocks_type;

    free_blocks_type free_blocks;
    allocated_blocks_type allocated_blocks;

    // Not copyable: declared but intentionally left undefined (C++03 idiom),
    // because a copy would free the same blocks again in its destructor.
    cached_allocator(const cached_allocator &);
    cached_allocator &operator=(const cached_allocator &);

    // Releases every tracked block, free or outstanding, and forgets it.
    void free_all()
    {
        std::cout << "cached_allocator::free_all(): cleaning up after ourselves..." << std::endl;

        // deallocate all cached free blocks
        for (free_blocks_type::iterator i = free_blocks.begin();
             i != free_blocks.end();
             ++i)
        {
            // transform the pointer to cuda::pointer before calling cuda::free
            thrust::cuda::free(thrust::cuda::pointer<char>(i->second));
        }
        free_blocks.clear();

        // deallocate all blocks that were never returned to the cache
        for (allocated_blocks_type::iterator i = allocated_blocks.begin();
             i != allocated_blocks.end();
             ++i)
        {
            // transform the pointer to cuda::pointer before calling cuda::free
            thrust::cuda::free(thrust::cuda::pointer<char>(i->first));
        }
        allocated_blocks.clear();
    }
};
int main()
{
#if defined(THRUST_GCC_VERSION) && (THRUST_GCC_VERSION < 40400)
std::cout << "This feature requires gcc >= 4.4" << std::endl;
return 0;
#endif
size_t n = 1 << 22;
thrust::host_vector<int> h_input(n);
// generate random input
thrust::generate(h_input.begin(), h_input.end(), rand);
thrust::cuda::vector<int> d_input = h_input;
thrust::cuda::vector<int> d_result(n);
size_t num_trials = 5;
// create a cached_allocator object
cached_allocator alloc;
for (size_t i = 0; i < num_trials; ++i)
{
// initialize data to sort
d_result = d_input;
// pass alloc through cuda::par as the first parameter to sort
// to cause allocations to be handled by alloc during sort
//thrust::sort(thrust::cuda::par(alloc), d_result.begin(), d_result.end());
thrust::exclusive_scan(thrust::cuda::par(alloc), d_result.begin(), d_result.end(), d_result.begin(), 0);
// ensure the result is sorted
assert(thrust::is_sorted(d_result.begin(), d_result.end()));
}
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment