Skip to content

Instantly share code, notes, and snippets.

@roastduck
Created March 25, 2019 12:34
Show Gist options
  • Save roastduck/1b15bddd4682be48e64a457053d86387 to your computer and use it in GitHub Desktop.
cuDNN logger using LD_PRELOAD
// compile: g++ -O2 -g -Wall -fPIC -shared hook.cpp -o hook.so -ldl -L/usr/local/cuda-10.0/extras/cudnn-7.4.2/lib64 -lcudnn -lcublas -lcudart
#include <cassert>
#include <mutex>
#include <string>
#include <fstream>
#include <iostream>
#include <type_traits>
#include <unordered_map>
#include <dlfcn.h>
// User headers
#include <cuda.h>
#include <cublas.h>
#include <cudnn.h>
namespace
{

std::mutex lock;                             // guards `os` and `dict` for the whole call
std::ofstream os("result.txt");              // trace output file
std::unordered_map<std::string, void*> dict; // cache: symbol name -> real address

/// Log one argument. Generic case: stream it verbatim.
template <class T>
void logOne(std::ostream &os, const T &arg)
{
    os << arg;
}

/// Log a cudaMemcpyKind as a compact direction string.
template <>
void logOne(std::ostream &os, const cudaMemcpyKind &arg)
{
    switch (arg)
    {
    case cudaMemcpyHostToHost: os << "H->H"; break;
    case cudaMemcpyHostToDevice: os << "H->D"; break;
    case cudaMemcpyDeviceToHost: os << "D->H"; break;
    case cudaMemcpyDeviceToDevice: os << "D->D"; break;
    default: assert(false);
    }
}

/// Log a tensor descriptor as "<dtype> d=[dims] s=[strides]".
template <>
void logOne(std::ostream &os, const cudnnTensorDescriptor_t &arg)
{
    // 10 slots: comfortably above cuDNN's CUDNN_DIM_MAX — TODO confirm for this cuDNN version.
    int n, dims[10], strides[10];
    cudnnDataType_t type;
    auto ret = cudnnGetTensorNdDescriptor(arg, 10, &type, &n, dims, strides);
    assert(ret == CUDNN_STATUS_SUCCESS);
    (void)ret; // avoid an unused-variable warning if NDEBUG strips the assert
    switch (type)
    {
    case CUDNN_DATA_FLOAT: os << "FLOAT"; break;
    case CUDNN_DATA_DOUBLE: os << "DOUBLE"; break;
    case CUDNN_DATA_HALF: os << "HALF"; break;
    case CUDNN_DATA_INT8: os << "INT8"; break;
    case CUDNN_DATA_INT32: os << "INT32"; break;
    case CUDNN_DATA_INT8x4: os << "INT8x4"; break;
    default: assert(false);
    }
    for (int i = 0; i < n; i++)
        os << (i ? "," : " d=[") << dims[i];
    for (int i = 0; i < n; i++)
        os << (i ? "," : "] s=[") << strides[i];
    os << "]";
}

/// Base case of the variadic logger: nothing left to print.
void logMany(std::ostream &os)
{
}

/// Log `first`, then the remaining arguments, comma-separated.
template <class T, class ...Args>
void logMany(std::ostream &os, T &&first, Args &&...args)
{
    logOne(os, first);
    if (sizeof...(args) > 0)
        os << ", ";
    logMany(os, std::forward<Args>(args)...);
}

/// Log one intercepted call as "name(arg1, arg2, ...)".
/// No trailing newline: the return value (if any) is appended by the caller.
template <class ...Args>
void log(std::ostream &os, const std::string &name, Args &&...args)
{
    os << name << "(";
    logMany(os, args...);
    os << ")";
}

/// Resolve the real implementation of `name` via RTLD_NEXT, caching the
/// result in `dict`. Exits the process if the symbol cannot be found.
/// Caller must hold `lock` (it protects `dict`).
template <class Func>
Func realFunc(const std::string &name)
{
    auto it = dict.find(name); // single lookup instead of count() + at()
    if (it != dict.end())
        return (Func)it->second;
    dlerror(); // clear any stale error state before calling dlsym
    auto func = (Func)dlsym(RTLD_NEXT, name.c_str());
    if (!func)
    {
        std::cerr << "[ERROR] Unable to load " << name << " : ";
        auto errstr = dlerror();
        if (errstr)
            std::cerr << errstr;
        else
            std::cerr << "No error";
        std::cerr << std::endl;
        exit(1);
    }
    dict[name] = (void*)func;
    return func;
}

// Invoke the real function and log its result, dispatched at compile time on
// whether the return type is void. The original used a runtime
// `if (std::is_same<Ret, void>)`: both branches get instantiated, so a
// void-returning Func makes `auto &&ret = func(args...)` ill-formed, and a
// non-void Func leaves a branch that falls off the end without returning.
// Tag dispatch keeps every instantiation well-formed.
template <class Ret, class Func, class ...Args>
Ret invokeLogged(std::false_type /*retIsVoid*/, Func func, Args &&...args)
{
    auto &&ret = func(args...);
    os << " -> ";
    logOne(os, ret);
    os << std::endl;
    return ret;
}

template <class Ret, class Func, class ...Args>
Ret invokeLogged(std::true_type /*retIsVoid*/, Func func, Args &&...args)
{
    func(args...);
    os << std::endl;
}

/// Log the call `name(args...)`, forward it to the real implementation, and
/// log (then return) its result. The whole operation holds `lock`, so log
/// lines from concurrent threads never interleave.
template <class Func, class ...Args>
typename std::result_of<Func(Args...)>::type proxy(const std::string &name, Args &&...args)
{
    typedef typename std::result_of<Func(Args...)>::type Ret;
    std::lock_guard<std::mutex> guard(lock);
    log(os, name, args...);
    Func func = realFunc<Func>(name);
    return invokeLogged<Ret>(std::is_same<Ret, void>(), func, args...);
}

} // Anonymous namespace
#define PROXY(func, ...) proxy<decltype(func)*>(#func, __VA_ARGS__)
// Interposer: logs the call, then forwards to the real cudaMemcpy.
cudaError_t cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind kind )
{
return PROXY(cudaMemcpy, dst, src, count, kind);
}
// Interposer: logs the call, then forwards to the real cudaMemcpyAsync.
cudaError_t cudaMemcpyAsync ( void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t stream)
{
return PROXY(cudaMemcpyAsync, dst, src, count, kind, stream);
}
// Interposer: logs the call, then forwards to the real cublasCreate.
cublasStatus_t
cublasCreate(cublasHandle_t *handle)
{
return PROXY(cublasCreate, handle);
}
// Interposer: logs the call, then forwards to the real cudnnCreate.
cudnnStatus_t cudnnCreate(cudnnHandle_t *handle)
{
return PROXY(cudnnCreate, handle);
}
// Interposer: logs the call (tensor descriptors expanded via logOne), then
// forwards to the real cudnnActivationBackward.
cudnnStatus_t
cudnnActivationBackward(cudnnHandle_t handle,
cudnnActivationDescriptor_t activationDesc,
const void *alpha,
const cudnnTensorDescriptor_t yDesc,
const void *y,
const cudnnTensorDescriptor_t dyDesc,
const void *dy,
const cudnnTensorDescriptor_t xDesc,
const void *x,
const void *beta,
const cudnnTensorDescriptor_t dxDesc,
void *dx)
{
return PROXY(cudnnActivationBackward, handle, activationDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x, beta, dxDesc, dx);
}
// Interposer: logs the call, then forwards to the real cudnnActivationForward.
cudnnStatus_t
cudnnActivationForward(cudnnHandle_t handle,
cudnnActivationDescriptor_t activationDesc,
const void *alpha,
const cudnnTensorDescriptor_t xDesc,
const void *x,
const void *beta,
const cudnnTensorDescriptor_t yDesc,
void *y)
{
return PROXY(cudnnActivationForward, handle, activationDesc, alpha, xDesc, x, beta, yDesc, y);
}
// Interposer: logs the call, then forwards to the real cudnnAddTensor.
cudnnStatus_t
cudnnAddTensor(cudnnHandle_t handle,
const void *alpha,
const cudnnTensorDescriptor_t aDesc,
const void *A,
const void *beta,
const cudnnTensorDescriptor_t cDesc,
void *C)
{
return PROXY(cudnnAddTensor, handle, alpha, aDesc, A, beta, cDesc, C);
}
// Interposer: logs the call, then forwards to the real cudnnConvolutionBackwardBias.
cudnnStatus_t
cudnnConvolutionBackwardBias(cudnnHandle_t handle,
const void *alpha,
const cudnnTensorDescriptor_t dyDesc,
const void *dy,
const void *beta,
const cudnnTensorDescriptor_t dbDesc,
void *db)
{
return PROXY(cudnnConvolutionBackwardBias, handle, alpha, dyDesc, dy, beta, dbDesc, db);
}
// Interposer: logs the call, then forwards to the real cudnnConvolutionBackwardData.
cudnnStatus_t
cudnnConvolutionBackwardData(cudnnHandle_t handle,
const void *alpha,
const cudnnFilterDescriptor_t wDesc,
const void *w,
const cudnnTensorDescriptor_t dyDesc,
const void *dy,
const cudnnConvolutionDescriptor_t convDesc,
cudnnConvolutionBwdDataAlgo_t algo,
void *workSpace,
size_t workSpaceSizeInBytes,
const void *beta,
const cudnnTensorDescriptor_t dxDesc,
void *dx)
{
return PROXY(cudnnConvolutionBackwardData, handle, alpha, wDesc, w, dyDesc, dy, convDesc, algo,
workSpace, workSpaceSizeInBytes, beta, dxDesc, dx);
}
// Interposer: logs the call, then forwards to the real cudnnConvolutionBackwardFilter.
cudnnStatus_t
cudnnConvolutionBackwardFilter(cudnnHandle_t handle,
const void *alpha,
const cudnnTensorDescriptor_t xDesc,
const void *x,
const cudnnTensorDescriptor_t dyDesc,
const void *dy,
const cudnnConvolutionDescriptor_t convDesc,
cudnnConvolutionBwdFilterAlgo_t algo,
void *workSpace,
size_t workSpaceSizeInBytes,
const void *beta,
const cudnnFilterDescriptor_t dwDesc,
void *dw)
{
return PROXY(cudnnConvolutionBackwardFilter, handle, alpha, xDesc, x, dyDesc, dy, convDesc, algo,
workSpace, workSpaceSizeInBytes, beta, dwDesc, dw);
}
// Interposer: logs the call, then forwards to the real
// cudnnConvolutionBiasActivationForward (fused conv+bias+activation entry point).
cudnnStatus_t
cudnnConvolutionBiasActivationForward(cudnnHandle_t handle,
const void *alpha1,
const cudnnTensorDescriptor_t xDesc,
const void *x,
const cudnnFilterDescriptor_t wDesc,
const void *w,
const cudnnConvolutionDescriptor_t convDesc,
cudnnConvolutionFwdAlgo_t algo,
void *workSpace,
size_t workSpaceSizeInBytes,
const void *alpha2,
const cudnnTensorDescriptor_t zDesc,
const void *z,
const cudnnTensorDescriptor_t biasDesc,
const void *bias,
const cudnnActivationDescriptor_t activationDesc,
const cudnnTensorDescriptor_t yDesc,
void *y)
{
return PROXY(cudnnConvolutionBiasActivationForward,
handle, alpha1, xDesc, x, wDesc, w, convDesc, algo, workSpace, workSpaceSizeInBytes, alpha2,
zDesc, z, biasDesc, bias, activationDesc, yDesc, y);
}
// Interposer: logs the call, then forwards to the real cudnnConvolutionForward.
cudnnStatus_t
cudnnConvolutionForward(cudnnHandle_t handle,
const void *alpha,
const cudnnTensorDescriptor_t xDesc,
const void *x,
const cudnnFilterDescriptor_t wDesc,
const void *w,
const cudnnConvolutionDescriptor_t convDesc,
cudnnConvolutionFwdAlgo_t algo,
void *workSpace,
size_t workSpaceSizeInBytes,
const void *beta,
const cudnnTensorDescriptor_t yDesc,
void *y)
{
return PROXY(cudnnConvolutionForward,
handle, alpha, xDesc, x, wDesc, w, convDesc, algo, workSpace, workSpaceSizeInBytes, beta, yDesc, y);
}
// Interposer: logs the call, then forwards to the real cudnnOpTensor.
cudnnStatus_t
cudnnOpTensor(cudnnHandle_t handle,
const cudnnOpTensorDescriptor_t opTensorDesc,
const void *alpha1,
const cudnnTensorDescriptor_t aDesc,
const void *A,
const void *alpha2,
const cudnnTensorDescriptor_t bDesc,
const void *B,
const void *beta,
const cudnnTensorDescriptor_t cDesc,
void *C)
{
return PROXY(cudnnOpTensor, handle, opTensorDesc, alpha1, aDesc, A, alpha2, bDesc, B, beta, cDesc, C);
}
// Interposer: logs the call, then forwards to the real cudnnPoolingBackward.
cudnnStatus_t
cudnnPoolingBackward(cudnnHandle_t handle,
const cudnnPoolingDescriptor_t poolingDesc,
const void *alpha,
const cudnnTensorDescriptor_t yDesc,
const void *y,
const cudnnTensorDescriptor_t dyDesc,
const void *dy,
const cudnnTensorDescriptor_t xDesc,
const void *x,
const void *beta,
const cudnnTensorDescriptor_t dxDesc,
void *dx)
{
return PROXY(cudnnPoolingBackward, handle, poolingDesc, alpha, yDesc, y, dyDesc, dy, xDesc,
x, beta, dxDesc, dx);
}
// Interposer: logs the call, then forwards to the real cudnnPoolingForward.
cudnnStatus_t
cudnnPoolingForward(cudnnHandle_t handle,
const cudnnPoolingDescriptor_t poolingDesc,
const void *alpha,
const cudnnTensorDescriptor_t xDesc,
const void *x,
const void *beta,
const cudnnTensorDescriptor_t yDesc,
void *y)
{
return PROXY(cudnnPoolingForward, handle, poolingDesc, alpha, xDesc, x, beta, yDesc, y);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment