Skip to content

Instantly share code, notes, and snippets.

View brycelelbach's full-sized avatar

Bryce Adelstein Lelbach aka wash brycelelbach

  • NVIDIA
  • Sunnyvale, CA
View GitHub Profile
#include <cassert>
int current_device()
{
int device = 0;
cudaError_t const error = cudaGetDevice(&device);
assert(cudaSuccess == error);
return device;
}
@brycelelbach
brycelelbach / blind_the_submissions.md
Last active September 19, 2019 04:04
Blind the Submissions

Blind the Submissions

Motivation

The first round of reviews of submissions to technical conferences should be double-blind (i.e., reviewers don't know who the submitters are, and submitters don't know who the reviewers are).

Non-double-blind submissions:

  • Contribute to Hero Culture: Hero culture is the tendency within technical
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
// This is how we run libc++ tests on the GPU without modification.
// We force include this header into each test with `-include`.
// Forward declaration of the entry point that fake_main_kernel calls as
// fake_main(0, NULL). The __host__ __device__ qualifiers make it callable
// from both host and device code.
// NOTE(review): presumably each test's own `main` is remapped to this name
// elsewhere in the build — confirm.
__host__ __device__
int fake_main(int, char**);
__global__
void fake_main_kernel(int * ret)
{
*ret = fake_main(0, NULL);
// I have this code:
struct thread_group {
private:
std::vector<std::thread> members;
public:
thread_group(thread_group const&) = delete;
thread_group& operator=(thread_group const&) = delete;
// Sort the sequence of integers by the Nth bit.
template <typename ExecutionPolicy,
std::ranges::random_access_range InputRange, random_access_iterator OutputIt>
requires std::integral<typename std::ranges_value_t<InputRange>>
unique_future<std::uint64_t> async_radix_sort_pass(ExecutionPolicy&& exec,
InputRange input, OutputRange output,
std::uint64_t bit)
{
auto const elements = std::distance(input);
template <typename InputIt, typename OutputIt>
OutputIt
radix_sort_split(InputIt first, InputIt last, OutputIt output, std::uint64_t bit)
{
std::vector<std::uint64_t> e(std::distance(first, last));
// Count 0s.
std::transform(first, last, e.begin(),
[=] (auto t) { return !(t & (1 << bit)); });
template <typename InputIterator, typename OutputIterator, typename T, typename BinaryOp>
OutputIterator exclusive_scan(InputIterator first, InputIterator last,
OutputIterator result, T init, BinaryOp op)
{
if (first != last) {
T saved = init;
do {
init = op(init, *first);
*result = saved;
saved = init;
template <typename InputIt, typename OutputIt, typename BinaryOp, typename T, typename Size>
unique_future<OutputIt>
async_inclusive_scan(InputIt first, InputIt last, OutputIt output,BinaryOp op, T init, Size chunk_size)
{
Size const elements = std::distance(first, last);
Size const chunks = (1 + ((elements - 1) / chunk_size)); // Round up.
std::vector<unique_future<T>> sweep;
sweep.reserve(chunks);
template <typename InputIt, typename OutputIt, typename BinaryOp, typename T, typename Size>
unique_future<OutputIt>
async_inclusive_scan(InputIt first, InputIt last, OutputIt output,BinaryOp op, T init, Size chunk_size)
{
Size const elements = std::distance(first, last);
Size const chunks = (1 + ((elements - 1) / chunk_size)); // Round up.
std::vector<unique_future<T>> sweep;
sweep.reserve(chunks);