Bryce Adelstein Lelbach aka wash brycelelbach

## switch_device.cu
#include <cassert>
#include <cstdio>
#include <cstdlib>

// Returns the device ordinal that `cudaGetDevice` reports.
//
// The status returned by `cudaGetDevice` is checked in every build
// configuration. A plain `assert` is compiled out when NDEBUG is defined,
// which would let a failed query go unnoticed and hand the caller the
// placeholder value 0 as if it were a real device ordinal.
//
// On failure the numeric `cudaError_t` value is written to stderr and the
// process is terminated with `std::abort()`; the function never returns an
// unverified ordinal.
int current_device()
{
  int device = 0;
  cudaError_t const error = cudaGetDevice(&device);
  if (cudaSuccess != error) {
    std::fprintf(stderr,
                 "current_device: cudaGetDevice failed (cudaError_t %d)\n",
                 static_cast<int>(error));
    std::abort();
  }
  return device;
}

## blind_the_submissions.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                brycelelbach
                / blind_the_submissions.md
            
            
              Last active
              September 19, 2019 04:04
            
              
                Blind the Submissions
              
          
    Blind the Submissions

Motivation

The first round of reviews of submissions to technical conferences should be
double-blind (i.e., reviewers don't know who the submitter is).
Non-double-blind submissions:

Contribute to Hero Culture: Hero culture is the tendency within technical


## cub_schmoo.cpp
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the

## libcudacxx_force_include.h
// This is how we run libc++ tests on the GPU without modification.
// We force include this header into each test with `-include`.

__host__ __device__
int fake_main(int, char**);

__global__
void fake_main_kernel(int * ret)
{
    *ret = fake_main(0, NULL);

## thread_group_ranges.cpp
// I have this code:

struct thread_group {
private:
  std::vector<std::thread> members;

public:
  thread_group(thread_group const&) = delete;
  thread_group& operator=(thread_group const&) = delete;

## asynchronous_radix_sort.cpp
// Sort the sequence of integers by the Nth bit.
template <typename ExecutionPolicy,
          std::ranges::random_access_range InputRange, random_access_iterator OutputIt>
  requires std::integral<typename std::ranges_value_t<InputRange>>
unique_future<std::uint64_t> async_radix_sort_pass(ExecutionPolicy&& exec,
                                                   InputRange input, OutputRange output,
                                                   std::uint64_t bit)
{
  auto const elements = std::distance(input);

## radix_sort_naive_serial.cpp
template <typename InputIt, typename OutputIt>
OutputIt
radix_sort_split(InputIt first, InputIt last, OutputIt output, std::uint64_t bit)
{
  std::vector<std::uint64_t> e(std::distance(first, last));

  // Count 0s.
  std::transform(first, last, e.begin(),
                 [=] (auto t) { return !(t & (1 << bit)); });

## exclusive_scan.cpp
template <typename InputIterator, typename OutputIterator, typename T, typename BinaryOp>
OutputIterator exclusive_scan(InputIterator first, InputIterator last,
                              OutputIterator result, T init, BinaryOp op)
{
  if (first != last) {
    T saved = init;
    do {
      init = op(init, *first);
      *result = saved;
      saved = init;

## work_efficient_async_inclusive_scan.cpp
template <typename InputIt, typename OutputIt, typename BinaryOp, typename T, typename Size>
unique_future<OutputIt>
async_inclusive_scan(InputIt first, InputIt last, OutputIt output,BinaryOp op, T init, Size chunk_size)
{
  Size const elements = std::distance(first, last);
  Size const chunks   = (1 + ((elements - 1) / chunk_size)); // Round up.

  std::vector<unique_future<T>> sweep;
  sweep.reserve(chunks);

## work_efficient_async_inclusive_scan.cpp
template <typename InputIt, typename OutputIt, typename BinaryOp, typename T, typename Size>
unique_future<OutputIt>
async_inclusive_scan(InputIt first, InputIt last, OutputIt output,BinaryOp op, T init, Size chunk_size)
{
  Size const elements = std::distance(first, last);
  Size const chunks   = (1 + ((elements - 1) / chunk_size)); // Round up.

  std::vector<unique_future<T>> sweep;
  sweep.reserve(chunks);
	#include <cassert>

	// Returns the device ordinal that `cudaGetDevice` reports.
	//
	// The status returned by `cudaGetDevice` is checked in every build
	// configuration. A plain `assert` is compiled out when NDEBUG is defined,
	// which would let a failed query go unnoticed and hand the caller the
	// placeholder value 0 as if it were a real device ordinal.
	//
	// On failure the numeric `cudaError_t` value is written to stderr and the
	// process is terminated with `std::abort()`; the function never returns an
	// unverified ordinal.
	int current_device()
	{
	  int device = 0;
	  cudaError_t const error = cudaGetDevice(&device);
	  if (cudaSuccess != error) {
	    std::fprintf(stderr,
	                 "current_device: cudaGetDevice failed (cudaError_t %d)\n",
	                 static_cast<int>(error));
	    std::abort();
	  }
	  return device;
	}
	/******************************************************************************
	* Copyright (c) 2011, Duane Merrill. All rights reserved.
	* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions are met:
	* * Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* * Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	// This is how we run libc++ tests on the GPU without modification.
	// We force include this header into each test with `-include`.

	__host__ __device__
	int fake_main(int, char**);

	__global__
	void fake_main_kernel(int * ret)
	{
	*ret = fake_main(0, NULL);
	// I have this code:

	struct thread_group {
	private:
	std::vector<std::thread> members;

	public:
	thread_group(thread_group const&) = delete;
	thread_group& operator=(thread_group const&) = delete;
	// Sort the sequence of integers by the Nth bit.
	template <typename ExecutionPolicy,
	std::ranges::random_access_range InputRange, random_access_iterator OutputIt>
	requires std::integral<typename std::ranges_value_t<InputRange>>
	unique_future<std::uint64_t> async_radix_sort_pass(ExecutionPolicy&& exec,
	InputRange input, OutputRange output,
	std::uint64_t bit)
	{
	auto const elements = std::distance(input);
	template <typename InputIt, typename OutputIt>
	OutputIt
	radix_sort_split(InputIt first, InputIt last, OutputIt output, std::uint64_t bit)
	{
	std::vector<std::uint64_t> e(std::distance(first, last));

	// Count 0s.
	std::transform(first, last, e.begin(),
	[=] (auto t) { return !(t & (1 << bit)); });
	template <typename InputIterator, typename OutputIterator, typename T, typename BinaryOp>
	OutputIterator exclusive_scan(InputIterator first, InputIterator last,
	OutputIterator result, T init, BinaryOp op)
	{
	if (first != last) {
	T saved = init;
	do {
	init = op(init, *first);
	*result = saved;
	saved = init;
	template <typename InputIt, typename OutputIt, typename BinaryOp, typename T, typename Size>
	unique_future<OutputIt>
	async_inclusive_scan(InputIt first, InputIt last, OutputIt output,BinaryOp op, T init, Size chunk_size)
	{
	Size const elements = std::distance(first, last);
	Size const chunks = (1 + ((elements - 1) / chunk_size)); // Round up.

	std::vector<unique_future<T>> sweep;
	sweep.reserve(chunks);