Kai Londenberg (kadeng)

  • Meta Inc.
  • Burgdorf, Germany
@kadeng
kadeng / test.cpp
Created June 27, 2024 09:57
Tracing type instantiation / declaration location
#include <iostream>
#include <sstream>
#include <cmath>
#include <typeinfo>
#define TRACEABLE_DECLARATION static constexpr int __line__ = __LINE__; static constexpr const char *__file__ = __FILE__;
#ifdef __GNUG__
#include <cxxabi.h>
#include <memory>
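// --- Hedged sketch (not part of the gist above, which is truncated by the listing) ---
// One way the TRACEABLE_DECLARATION macro could be used: embed it in a type, then
// combine typeid() with the GCC demangler to report the demangled type name together
// with the file and line where the type was declared. The struct Example and
// report_declaration() below are illustrative names, not taken from the gist.
#include <cstdlib>
#include <cxxabi.h>
#include <iostream>
#include <memory>
#include <typeinfo>

#define TRACEABLE_DECLARATION static constexpr int __line__ = __LINE__; static constexpr const char *__file__ = __FILE__;

struct Example {
    TRACEABLE_DECLARATION // records the file and line of this declaration inside the type
};

template <typename T>
void report_declaration() {
    int status = 0;
    // abi::__cxa_demangle returns a malloc'd buffer; hand it to unique_ptr so it is freed.
    std::unique_ptr<char, void (*)(void *)> name(
        abi::__cxa_demangle(typeid(T).name(), nullptr, nullptr, &status), std::free);
    std::cout << (status == 0 ? name.get() : typeid(T).name())
              << " declared at " << T::__file__ << ":" << T::__line__ << std::endl;
}

int main() {
    report_declaration<Example>(); // prints something like "Example declared at sketch.cpp:14"
    return 0;
}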
@kadeng
kadeng / repro.cu
Created June 6, 2024 17:37
cudaGraphInstantiate mem leak repro
#include <cuda.h>
#include <iostream>
#define ASSERT_EQ(a,b) if (a!=b) { std::cerr << "Error" << std::endl << " Last CUDA error: " << cudaGetErrorName(cudaPeekAtLastError()) << ": " << cudaGetErrorString(cudaPeekAtLastError()) << std::endl; exit(1); }
__global__ void set_array_value(float *data, size_t num_elements, float value) {
int idx = blockIdx.x*blockDim.x + threadIdx.x;
if (idx<num_elements) {
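// --- Hedged sketch (not the gist's actual code; the preview above is cut off) ---
// Self-contained illustration of the kind of loop a cudaGraphInstantiate leak repro
// typically runs: capture work into a graph, instantiate, launch, destroy, repeat,
// and watch process memory grow. The iteration count, CHECK macro, and fill_kernel
// workload are assumptions for illustration.
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

#define CHECK(call) do { cudaError_t err_ = (call); if (err_ != cudaSuccess) { \
    fprintf(stderr, "%s:%d %s\n", __FILE__, __LINE__, cudaGetErrorString(err_)); exit(1); } } while (0)

__global__ void fill_kernel(float *data, size_t num_elements, float value) {
    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < num_elements) {
        data[idx] = value;
    }
}

int main() {
    const size_t num_elements = 1 << 20;
    float *data = nullptr;
    CHECK(cudaMalloc(&data, num_elements * sizeof(float)));
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    const int kIterations = 10000; // assumption: enough iterations to make a leak visible
    for (int i = 0; i < kIterations; ++i) {
        cudaGraph_t graph;
        cudaGraphExec_t graph_exec;
        // Capture one kernel launch into a graph.
        CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
        fill_kernel<<<(unsigned int)((num_elements + 255) / 256), 256, 0, stream>>>(data, num_elements, 1.0f);
        CHECK(cudaStreamEndCapture(stream, &graph));
        // Instantiate, launch, and tear down (CUDA 12 three-argument signature;
        // older toolkits use the five-argument form of cudaGraphInstantiate).
        CHECK(cudaGraphInstantiate(&graph_exec, graph, 0));
        CHECK(cudaGraphLaunch(graph_exec, stream));
        CHECK(cudaStreamSynchronize(stream));
        CHECK(cudaGraphExecDestroy(graph_exec));
        CHECK(cudaGraphDestroy(graph));
    }

    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(data));
    return 0;
}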
@kadeng
kadeng / test.py
Created March 20, 2024 15:54
test method attempt
def test_wrapper_codegen_statically_known_int_or_none(self) -> typing.List[CachingAutotuner]:
from torch._dynamo.utils import detect_fake_mode
from torch._inductor.codegen.common import boolean_ops
from torch._inductor.codegen.wrapper import WrapperCodeGen
from torch._inductor.compile_fx import _shape_env_from_inputs
from torch._inductor.debug import DebugContext
from torch._inductor.graph import GraphLowering
from torch._inductor.virtualized import V
from torch.fx.passes.fake_tensor_prop import FakeTensorProp
@kadeng
kadeng / repro.sh
Created December 20, 2023 17:53
Cutlass Error repro cases
#!/bin/bash
# Change the environment variables to point to Cutlass and CUDA Toolkit and run this,
# passing any of the standalone repro_N.cu files as argument. It will compile and run the
# example.
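# Example (filename assumed for illustration): "./repro.sh repro_1.cu" compiles
# repro_1.cu with nvcc against the Cutlass and CUDA paths configured below and
# then runs the resulting ./repro_1.cu.exe.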
set -x
export REPRO_CUTLASS_PATH=/home/klondenberg/github/pytorch/pytorch/third_party/cutlass
export REPRO_CUDA_PATH=/home/klondenberg/local/cuda121
$REPRO_CUDA_PATH/bin/nvcc -t=0 -DCUTLASS_ENABLE_TENSOR_CORE_MMA=1 -w -gencode=arch=compute_90a,code=[sm_90a,compute_90a] -O1 -std=c++17 --expt-relaxed-constexpr -Xcompiler=-fPIC --use_fast_math -Xcompiler=-fno-strict-aliasing -Xcompiler -fvisibility=hidden -Xcompiler=-Wconversion -I${REPRO_CUTLASS_PATH}/include -I${REPRO_CUTLASS_PATH}/tools/library/include -I${REPRO_CUTLASS_PATH}/tools/library/src -I${REPRO_CUTLASS_PATH}/tools/util/include -L${REPRO_CUDA_PATH}/lib64 -L${REPRO_CUDA_PATH}/lib64/stubs -lcuda -lcudart -DGENERATE_STANDALONE_RUNNER -DNDEBUG -DCUTLASS_DEBUG_TRACE_LEVEL=1 -o "${@}.exe" "$@"
"./${@}.exe"
@kadeng
kadeng / performance_repro.cu
Created December 7, 2023 12:20
Cutlass performance regression
Environment:
* Linux x64, NVIDIA H100 GPU
* CUDA 12.1
* Cutlass v3.3.0 (tagged release) and Cutlass v3.2.2 (tagged release)
Command (example):
nvcc -t=0 -DCUTLASS_ENABLE_TENSOR_CORE_MMA=1 -w -gencode=arch=compute_90a,code=[sm_90a,compute_90a] -O1 -std=c++17 --expt-relaxed-constexpr -Xcompiler=-fPIC --use-fast-math -Xcompiler=-fno-strict-aliasing -Xcompiler -fvisibility=hidden -Xcompiler=-Wconversion -I/home/klondenberg/github/pytorch/pytorch/third_party/cutlass/include -I/home/klondenberg/github/pytorch/pytorch/third_party/cutlass/tools/library/include -I/home/klondenberg/github/pytorch/pytorch/third_party/cutlass/tools/library/src -I/home/klondenberg/github/pytorch/pytorch/third_party/cutlass/tools/util/include -L/home/klondenberg/local/cuda121/lib64 -L/home/klondenberg/local/cuda121/lib64/stubs -lcuda -lcudart -DGENERATE_STANDALONE_RUNNER -o performance_repro performance_repro.cu
@kadeng
kadeng / Build instructions
Last active December 6, 2023 21:37
Cutlass bug report 2
Environment:
* Linux x64, NVIDIA H100 GPU
* CUDA 12.1
* Cutlass v3.3.0 (tagged release)
Command (example):
nvcc -t=0 -DCUTLASS_ENABLE_TENSOR_CORE_MMA=1 -w -gencode=arch=compute_90a,code=[sm_90a,compute_90a] -O1 -std=c++17 --expt-relaxed-constexpr -lineinfo -g -DCUTLASS_DEBUG_TRACE_LEVEL=1 -Xcompiler=-fPIC -Xcompiler=-fno-strict-aliasing -Xcompiler -fvisibility=hidden -Xcompiler=-Wconversion -I/home/klondenberg/github/pytorch/pytorch/third_party/cutlass/include -I/home/klondenberg/github/pytorch/pytorch/third_party/cutlass/tools/library/include -I/home/klondenberg/github/pytorch/pytorch/third_party/cutlass/tools/library/src -I/home/klondenberg/github/pytorch/pytorch/third_party/cutlass/tools/util/include -L/home/klondenberg/local/cuda121/lib64 -L/home/klondenberg/local/cuda121/lib64/stubs -lcuda -lcudart -DGENERATE_STANDALONE_RUNNER -o broken6 broken6.cu
@kadeng
kadeng / Error trace
Last active December 6, 2023 16:43
Cutlass bug report
Out-of-range shared or local address
========= at 0xbd0 in /home/klondenberg/github/pytorch/pytorch/third_party/cutlass/include/cutlass/arch/barrier.h:169:cutlass::arch::ClusterBarrier::init(const unsigned long *, unsigned int)
========= by thread (0,0,0) in block (0,1,0)
========= Device Frame:/home/klondenberg/github/pytorch/pytorch/third_party/cutlass/include/cutlass/arch/barrier.h:127:cutlass::arch::ClusterBarrier::init(unsigned int) const [0xb20]
========= Device Frame:/home/klondenberg/github/pytorch/pytorch/third_party/cutlass/include/cutlass/pipeline/sm90_pipeline.hpp:1073:cutlass::OrderedSequenceBarrier<(int)1, (int)2>::OrderedSequenceBarrier(cutlass::OrderedSequenceBarrier<(int)1, (int)2>::SharedStorage &, const cutlass::OrderedSequenceBarrier<(int)1, (int)2>::Params &) [0xb20]
========= Device Frame:/home/klondenberg/github/pytorch/pytorch/third_party/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp:382:cutlass::gemm::kernel::GemmUniversal<cute::tup
@kadeng
kadeng / AndroidManifest.xml
Created November 10, 2020 22:02 — forked from Venryx/AndroidManifest.xml
Record audio on Android in the background (even when screen is off)
<?xml version="1.0" encoding="utf-8"?>
<manifest xmlns:android="http://schemas.android.com/apk/res/android" package="com.myapp">
<application android:allowBackup="true" android:icon="@mipmap/ic_launcher" android:label="@string/app_name"
android:roundIcon="@mipmap/ic_launcher_round" android:supportsRtl="true" android:theme="@style/AppTheme">
<service android:name=".ForegroundService" android:enabled="true" android:exported="true"></service>
<activity
android:configChanges="orientation|keyboardHidden|keyboard|screenSize|locale"
@kadeng
kadeng / tensorflow_report_tensor_allocations_upon_oom.py
Created November 12, 2018 14:54
Tensorflow report_tensor_allocations_upon_oom
# Working Example to use report_tensor_allocations_upon_oom
# with recent Tensorflow and Keras
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config = config)
K.set_session(sess)
run_opts = tf.RunOptions(report_tensor_allocations_upon_oom = True)
run_metadata = tf.RunMetadata()
# ...
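# Hypothetical continuation (not shown in the truncated gist preview above): the run
# options and metadata only take effect once they are passed into the actual run call,
# e.g. sess.run(fetches, options=run_opts, run_metadata=run_metadata).
# With Keras, the same objects need to be forwarded through the call that eventually
# invokes session.run, so that an OOM raises with per-tensor allocation information.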
@kadeng
kadeng / !README.md
Created September 13, 2018 07:19 — forked from zed/!README.md
swig hello world example