Created
May 31, 2016 09:12
-
-
Save Mochimazui/007b243e7be16527f5d5380d596c1993 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <thrust/system/cuda/vector.h> | |
#include <thrust/system/cuda/execution_policy.h> | |
#include <thrust/host_vector.h> | |
#include <thrust/generate.h> | |
#include <thrust/sort.h> | |
#include <thrust/scan.h> | |
#include <thrust/pair.h> | |
#include <cstdlib> | |
#include <iostream> | |
#include <map> | |
#include <cassert> | |
// This example demonstrates how to intercept calls to get_temporary_buffer | |
// and return_temporary_buffer to control how Thrust allocates temporary storage | |
// during algorithms such as thrust::sort. The idea will be to create a simple | |
// cache of allocations to search when temporary storage is requested. If a hit | |
// is found in the cache, we quickly return the cached allocation instead of | |
// resorting to the more expensive thrust::cuda::malloc. | |
// | |
// Note: this implementation cached_allocator is not thread-safe. If multiple | |
// (host) threads use the same cached_allocator then they should gain exclusive | |
// access to the allocator before accessing its methods. | |
// cached_allocator: a simple allocator for caching allocation requests | |
class cached_allocator | |
{ | |
public: | |
// just allocate bytes | |
typedef char value_type; | |
cached_allocator() {} | |
~cached_allocator() | |
{ | |
// free all allocations when cached_allocator goes out of scope | |
free_all(); | |
} | |
char *allocate(std::ptrdiff_t num_bytes) | |
{ | |
char *result = 0; | |
// search the cache for a free block | |
free_blocks_type::iterator free_block = free_blocks.find(num_bytes); | |
if (free_block != free_blocks.end()) | |
{ | |
std::cout << "cached_allocator::allocator(): found a hit" << std::endl; | |
// get the pointer | |
result = free_block->second; | |
// erase from the free_blocks map | |
free_blocks.erase(free_block); | |
} | |
else | |
{ | |
// no allocation of the right size exists | |
// create a new one with cuda::malloc | |
// throw if cuda::malloc can't satisfy the request | |
try | |
{ | |
std::cout << "cached_allocator::allocator(): no free block found; calling cuda::malloc" << std::endl; | |
// allocate memory and convert cuda::pointer to raw pointer | |
result = thrust::cuda::malloc<char>(num_bytes).get(); | |
} | |
catch (std::runtime_error &e) | |
{ | |
throw; | |
} | |
} | |
// insert the allocated pointer into the allocated_blocks map | |
allocated_blocks.insert(std::make_pair(result, num_bytes)); | |
return result; | |
} | |
void deallocate(char *ptr, size_t n) | |
{ | |
// erase the allocated block from the allocated blocks map | |
allocated_blocks_type::iterator iter = allocated_blocks.find(ptr); | |
std::ptrdiff_t num_bytes = iter->second; | |
allocated_blocks.erase(iter); | |
// insert the block into the free blocks map | |
free_blocks.insert(std::make_pair(num_bytes, ptr)); | |
} | |
private: | |
typedef std::multimap<std::ptrdiff_t, char*> free_blocks_type; | |
typedef std::map<char *, std::ptrdiff_t> allocated_blocks_type; | |
free_blocks_type free_blocks; | |
allocated_blocks_type allocated_blocks; | |
void free_all() | |
{ | |
std::cout << "cached_allocator::free_all(): cleaning up after ourselves..." << std::endl; | |
// deallocate all outstanding blocks in both lists | |
for (free_blocks_type::iterator i = free_blocks.begin(); | |
i != free_blocks.end(); | |
++i) | |
{ | |
// transform the pointer to cuda::pointer before calling cuda::free | |
thrust::cuda::free(thrust::cuda::pointer<char>(i->second)); | |
} | |
for (allocated_blocks_type::iterator i = allocated_blocks.begin(); | |
i != allocated_blocks.end(); | |
++i) | |
{ | |
// transform the pointer to cuda::pointer before calling cuda::free | |
thrust::cuda::free(thrust::cuda::pointer<char>(i->first)); | |
} | |
} | |
}; | |
int main() | |
{ | |
#if defined(THRUST_GCC_VERSION) && (THRUST_GCC_VERSION < 40400) | |
std::cout << "This feature requires gcc >= 4.4" << std::endl; | |
return 0; | |
#endif | |
size_t n = 1 << 22; | |
thrust::host_vector<int> h_input(n); | |
// generate random input | |
thrust::generate(h_input.begin(), h_input.end(), rand); | |
thrust::cuda::vector<int> d_input = h_input; | |
thrust::cuda::vector<int> d_result(n); | |
size_t num_trials = 5; | |
// create a cached_allocator object | |
cached_allocator alloc; | |
for (size_t i = 0; i < num_trials; ++i) | |
{ | |
// initialize data to sort | |
d_result = d_input; | |
// pass alloc through cuda::par as the first parameter to sort | |
// to cause allocations to be handled by alloc during sort | |
//thrust::sort(thrust::cuda::par(alloc), d_result.begin(), d_result.end()); | |
thrust::exclusive_scan(thrust::cuda::par(alloc), d_result.begin(), d_result.end(), d_result.begin(), 0); | |
// ensure the result is sorted | |
assert(thrust::is_sorted(d_result.begin(), d_result.end())); | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment