Skip to content

Instantly share code, notes, and snippets.

@akors
Created December 14, 2017 15:02
Show Gist options
  • Save akors/7cbfd7f60796397cec4dcf6c8f0a84bb to your computer and use it in GitHub Desktop.
Save akors/7cbfd7f60796397cec4dcf6c8f0a84bb to your computer and use it in GitHub Desktop.
CUDA device linking with shared objects - cudart vs cudart_static
#!/bin/sh
# Manual build of the relocatable-device-code demo.
# Produces libkernel.a, libkernel2.a, liballkernels.so and the rdctest
# executable. The host link steps use the shared CUDA runtime (-lcudart).
#
# Environment overrides (defaults reproduce the original hard-coded values):
#   CUDA_HOME  CUDA toolkit root            (default: /usr/local/cuda)
#   GENCODE    nvcc code-generation flags   (default: compute_61 PTX)

# Stop at the first failing command instead of linking stale/missing objects.
set -e

CUDA_HOME="${CUDA_HOME:-/usr/local/cuda}"
GENCODE="${GENCODE:--gencode arch=compute_61,code=compute_61}"

# libkernel.a: kernel.cu compiled as relocatable device code (-dc).
# $GENCODE is intentionally unquoted so it splits into separate arguments.
nvcc $GENCODE -Xcompiler=-fPIC -x cu -dc kernel.cu -o kernel.cu.o
# "r" replaces an existing member; "q" would append a duplicate on every re-run.
ar rc libkernel.a kernel.cu.o
ranlib libkernel.a

# libkernel2.a: same recipe for kernel2.cu.
nvcc $GENCODE -Xcompiler=-fPIC -x cu -dc kernel2.cu -o kernel2.cu.o
ar rc libkernel2.a kernel2.cu.o
ranlib libkernel2.a

# liballkernels.so: device-link empty.cu.o with both archives, then do the
# host link into a shared object.
nvcc $GENCODE -Xcompiler=-fPIC -x cu -dc empty.cu -o empty.cu.o
nvcc --cudart shared $GENCODE -Xcompiler=-fPIC -Wno-deprecated-gpu-targets -shared -dlink empty.cu.o -o allkernels_device_link.o libkernel.a libkernel2.a
g++ -fPIC -shared -Wl,-soname,liballkernels.so -o liballkernels.so empty.cu.o allkernels_device_link.o libkernel.a libkernel2.a -L"$CUDA_HOME/lib64/stubs" -L"$CUDA_HOME/lib64" -lcudadevrt -lcudart

# rdctest: compile main.cpp, run a second device link against the same
# archives, then link the executable against liballkernels.so AND the archives.
g++ -fPIE -o main.cpp.o -c main.cpp
nvcc --cudart shared $GENCODE -Xcompiler=-fPIC -Wno-deprecated-gpu-targets -shared -dlink main.cpp.o -o main_device_link.o -L"$CUDA_HOME/lib64/stubs" -L"$CUDA_HOME/lib64" libkernel.a libkernel2.a
g++ main.cpp.o main_device_link.o -o rdctest -L"$CUDA_HOME/lib64/stubs" -L"$CUDA_HOME/lib64" -Wl,-rpath,. liballkernels.so libkernel.a libkernel2.a -lcudadevrt -lcudart
#!/bin/sh
# Manual build of the relocatable-device-code demo, static-runtime variant.
# Produces libkernel.a, libkernel2.a, liballkernels.so and the rdctest
# executable. The host link steps use the static CUDA runtime
# (-lcudart_static plus its -lrt -lpthread -ldl dependencies).
#
# Environment overrides (defaults reproduce the original hard-coded values):
#   CUDA_HOME  CUDA toolkit root            (default: /usr/local/cuda)
#   GENCODE    nvcc code-generation flags   (default: compute_61 PTX)

# Stop at the first failing command instead of linking stale/missing objects.
set -e

CUDA_HOME="${CUDA_HOME:-/usr/local/cuda}"
GENCODE="${GENCODE:--gencode arch=compute_61,code=compute_61}"

# libkernel.a: kernel.cu compiled as relocatable device code (-dc).
# $GENCODE is intentionally unquoted so it splits into separate arguments.
nvcc $GENCODE -Xcompiler=-fPIC -x cu -dc kernel.cu -o kernel.cu.o
# "r" replaces an existing member; "q" would append a duplicate on every re-run.
ar rc libkernel.a kernel.cu.o
ranlib libkernel.a

# libkernel2.a: same recipe for kernel2.cu.
nvcc $GENCODE -Xcompiler=-fPIC -x cu -dc kernel2.cu -o kernel2.cu.o
ar rc libkernel2.a kernel2.cu.o
ranlib libkernel2.a

# liballkernels.so: device-link empty.cu.o with both archives, then do the
# host link into a shared object.
# NOTE(review): the -dlink steps still pass "--cudart shared" although the host
# links below use cudart_static; kept as in the original - confirm intent.
nvcc $GENCODE -Xcompiler=-fPIC -x cu -dc empty.cu -o empty.cu.o
nvcc --cudart shared $GENCODE -Xcompiler=-fPIC -Wno-deprecated-gpu-targets -shared -dlink empty.cu.o -o allkernels_device_link.o libkernel.a libkernel2.a
g++ -fPIC -shared -Wl,-soname,liballkernels.so -o liballkernels.so empty.cu.o allkernels_device_link.o libkernel.a libkernel2.a -L"$CUDA_HOME/lib64/stubs" -L"$CUDA_HOME/lib64" -lcudadevrt -lcudart_static -lrt -lpthread -ldl

# rdctest: compile main.cpp, run a second device link against the same
# archives, then link the executable against liballkernels.so AND the archives.
g++ -fPIE -o main.cpp.o -c main.cpp
nvcc --cudart shared $GENCODE -Xcompiler=-fPIC -Wno-deprecated-gpu-targets -shared -dlink main.cpp.o -o main_device_link.o -L"$CUDA_HOME/lib64/stubs" -L"$CUDA_HOME/lib64" libkernel.a libkernel2.a
g++ main.cpp.o main_device_link.o -o rdctest -L"$CUDA_HOME/lib64/stubs" -L"$CUDA_HOME/lib64" -Wl,-rpath,. liballkernels.so libkernel.a libkernel2.a -lcudadevrt -lcudart_static -lrt -lpthread -ldl
# CUDA as a first-class project() language was introduced in CMake 3.8, so
# 3.7 cannot configure this project.
cmake_minimum_required(VERSION 3.8)
project(CudaSharedThrust CXX CUDA)

# Emit PTX for compute capability 6.1 for every CUDA source.
string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_61,code=compute_61")

# When allkernels is built as a shared library, everything linked into it
# (including the static kernel libraries) must be position independent.
if(BUILD_SHARED_LIBS)
  set(CMAKE_POSITION_INDEPENDENT_CODE ON)
endif()

# Two static libraries, each holding one kernel compiled as relocatable
# device code.
add_library(kernel STATIC kernel.cu)
set_target_properties(kernel PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

add_library(kernel2 STATIC kernel2.cu)
set_target_properties(kernel2 PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

# Aggregating library; static or shared depending on BUILD_SHARED_LIBS.
add_library(allkernels empty.cu) # empty.cu is an empty file
set_target_properties(allkernels PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_link_libraries(allkernels kernel kernel2)

# Host-only test executable that calls into both kernels via allkernels.
add_executable(rdctest main.cpp)
set_target_properties(rdctest PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_link_libraries(rdctest allkernels)
#include "kernel.cuh"
#include <stdio.h>
#include <iostream>
#include <thrust/device_vector.h>
// Demo kernel: each thread stores twice its threadIdx.x into data[threadIdx.x];
// thread 0 additionally prints a greeting via device-side printf.
// Only threadIdx.x is used for indexing and there is no bounds check, so the
// caller must launch no more threads per block than `data` has elements.
__global__
void thekernel(int *data){
    const unsigned int tid = threadIdx.x;

    if (tid == 0) {
        printf("the kernel says hello\n");
    }

    data[tid] = tid * 2;
}
// Allocates an 11-element thrust::device_vector<int>, launches thekernel on it
// with one block of 10 threads, reports any CUDA error on stdout, and then
// prints every element of the vector.
// Only the first 10 elements are written by the kernel; the last one keeps the
// value it received from resize().
void Kernel::callKernel(){
    thrust::device_vector<int> D2;
    D2.resize(11);
    int * raw_ptr = thrust::raw_pointer_cast(&D2[0]);

    printf("Kernel::callKernel called\n");
    thekernel <<< 1, 10 >>> (raw_ptr);

    // A kernel launch returns nothing: configuration errors are picked up with
    // cudaGetLastError(), execution errors by the following synchronization.
    cudaError_t code = cudaGetLastError();
    if (code == cudaSuccess) {
        // cudaDeviceSynchronize() replaces the deprecated cudaThreadSynchronize();
        // its return value carries asynchronous kernel failures.
        code = cudaDeviceSynchronize();
    }
    if (code != cudaSuccess) {
        std::cout << "Cuda error: " << cudaGetErrorString(code) << " after callKernel!" << std::endl;
    }

    // size() is unsigned; use an unsigned index to avoid a mixed-sign compare.
    // Each D2[i] read goes through thrust's device_reference.
    for (size_t i = 0; i < D2.size(); i++)
        std::cout << "Kernel D[" << i << "]=" << D2[i] << std::endl;
}
// Include guard. Identifiers containing a double underscore are reserved for
// the implementation in C++, so the guard avoids the __NAME__ form.
#ifndef KERNEL_CUH_
#define KERNEL_CUH_

// Host-side wrapper around the first demo kernel.
class Kernel{
public:
    // Launches the kernel and prints the resulting device data
    // (defined in kernel.cu).
    void callKernel();
};

#endif // KERNEL_CUH_
#include "kernel2.cuh"
#include <stdio.h>
#include <iostream>
#include <thrust/device_vector.h>
// Second demo kernel: each thread stores twice its threadIdx.x into
// data2[threadIdx.x]; thread 0 additionally prints a greeting via device-side
// printf. Only threadIdx.x is used for indexing and there is no bounds check,
// so the caller must launch no more threads per block than `data2` has elements.
__global__
void thekernel2(int *data2){
    const unsigned int tid = threadIdx.x;

    if (tid == 0) {
        printf("the kernel2 says hello\n");
    }

    data2[tid] = tid * 2;
}
// Allocates an 11-element thrust::device_vector<int>, launches thekernel2 on
// it with one block of 10 threads, reports any CUDA error on stdout, and then
// prints every element of the vector.
// Only the first 10 elements are written by the kernel; the last one keeps the
// value it received from resize().
void Kernel2::callKernel2(){
    thrust::device_vector<int> D;
    D.resize(11);
    int * raw_ptr = thrust::raw_pointer_cast(&D[0]);

    printf("Kernel2::callKernel2 called\n");
    thekernel2 <<< 1, 10 >>> (raw_ptr);

    // A kernel launch returns nothing: configuration errors are picked up with
    // cudaGetLastError(), execution errors by the following synchronization.
    cudaError_t code = cudaGetLastError();
    if (code == cudaSuccess) {
        // cudaDeviceSynchronize() replaces the deprecated cudaThreadSynchronize();
        // its return value carries asynchronous kernel failures.
        code = cudaDeviceSynchronize();
    }
    if (code != cudaSuccess) {
        std::cout << "Cuda error: " << cudaGetErrorString(code) << " after callKernel2!" << std::endl;
    }

    // size() is unsigned; use an unsigned index to avoid a mixed-sign compare.
    // Each D[i] read goes through thrust's device_reference.
    for (size_t i = 0; i < D.size(); i++)
        std::cout << "Kernel2 D[" << i << "]=" << D[i] << std::endl;
}
// Include guard. Identifiers containing a double underscore are reserved for
// the implementation in C++, so the guard avoids the __NAME__ form.
#ifndef KERNEL2_CUH_
#define KERNEL2_CUH_

// Host-side wrapper around the second demo kernel.
class Kernel2{
public:
    // Launches the kernel and prints the resulting device data
    // (defined in kernel2.cu).
    void callKernel2();
};

#endif // KERNEL2_CUH_
#include "kernel.cuh"
#include "kernel2.cuh"
int main(){
Kernel k;
k.callKernel();
Kernel2 k2;
k2.callKernel2();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment