Trevor Morris (trevor-m)

2023-06-05 17:37:07.425028: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/lib/x86_64-linux-gnu
2023-06-05 17:37:15.664716: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/lib/x86_64-linux-gnu
2023-06-05 17:37:15.664797: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/lib/x86_64-linux-gnu
2023-06-05 17:37:15.664863: W tensorflow/stream_executor/platform/de
trevor-m / paxmlsegfault_logginenabled.txt
Created June 2, 2023 22:36
PAXML + PJRT Segfault with LOGGING_ENABLED
2023-06-02 22:27:23.476734: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/lib/x86_64-linux-gnu
2023-06-02 22:27:30.192610: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/lib/x86_64-linux-gnu
2023-06-02 22:27:30.192664: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/lib/x86_64-linux-gnu
2023-06-02 22:27:30.192699: W tensorflow/stream_executor/platform/de
trevor-m / Dockerfile
Last active June 5, 2023 22:43
PAXML + PJRT Container
# Build using `docker build -t pjrt .`
# Inside container, run using:
# 1 device
# CUDA_VISIBLE_DEVICES=0 JAX_PLATFORMS=iree_cuda python main.py --exp=tasks.lm.params.nvidia.NVIDIA1_3BPmap --job_log_dir=log_NVIDIA1_3BPmap
# 2 devices
# CUDA_VISIBLE_DEVICES=0,1 JAX_PLATFORMS=iree_cuda mpirun --allow-run-as-root -np 2 python main.py --exp=tasks.lm.params.nvidia.NVIDIA1_3BPmap --job_log_dir=log_NVIDIA1_3BPmap --mode=eval --multiprocess_gpu
FROM ghcr.io/nvidia/pax:nightly-2023-05-23
SHELL [ "/bin/bash", "-c" ]
ENV CUDA_SDK_DIR=/usr/local/cuda
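The preview ends at the CUDA SDK path; the rest of the Dockerfile is not shown. Below is a rough sketch of the kind of steps such a container typically adds to expose an IREE-backed PJRT plugin to JAX. The repository URL, build step, and plugin path are assumptions; only the iree_cuda platform name (used in the run commands above) and JAX's PJRT_NAMES_AND_LIBRARY_PATHS discovery variable are taken as given.

# Hedged sketch, not the gist's actual Dockerfile contents.
# Fetch and build an IREE-based PJRT plugin (URL and paths are assumptions).
RUN git clone https://github.com/openxla/openxla-pjrt-plugin.git /opt/openxla-pjrt-plugin
WORKDIR /opt/openxla-pjrt-plugin
# ... build the iree_cuda plugin shared library here ...
# Register the plugin with JAX so JAX_PLATFORMS=iree_cuda (see run commands
# above) can select it at run time. The .so path is an assumption.
ENV PJRT_NAMES_AND_LIBRARY_PATHS="iree_cuda:/opt/openxla-pjrt-plugin/pjrt_plugin_iree_cuda.so"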
trevor-m / asan.txt
Created June 2, 2023 20:20
PAXML PJRT Segfault asan
AddressSanitizer:DEADLYSIGNAL
=================================================================
==54208==ERROR: AddressSanitizer: SEGV on unknown address 0x00000000d3c0 (pc 0x7f8464007a7c bp 0x00000000d3c0 sp 0x6310013ea1f0 T0)
==54208==The signal is caused by a READ memory access.
#0 0x7f8464007a7c in pthread_kill (/usr/lib/x86_64-linux-gnu/libc.so.6+0x96a7c) (BuildId: 69389d485a9793dbe873f0ea2c93e02efaa9aa3d)
#1 0x7f8463fb3475 in gsignal (/usr/lib/x86_64-linux-gnu/libc.so.6+0x42475) (BuildId: 69389d485a9793dbe873f0ea2c93e02efaa9aa3d)
#2 0x7f8463fb351f (/usr/lib/x86_64-linux-gnu/libc.so.6+0x4251f) (BuildId: 69389d485a9793dbe873f0ea2c93e02efaa9aa3d)
#3 0x7f8282aff8f6 in iree_hal_resource_set_free resource_set.c
#4 0x7f8282afb705 in iree_hal_deferred_command_buffer_destroy deferred_command_buffer.c
#5 0x7f828231da39 in iree_vm_ref_move ref.c
trevor-m / paxmlsegfault.txt
Last active June 1, 2023 23:41
PJRT Paxml Segfault Backtrace
Thread 1 (Thread 0x7f53cee40000 (LWP 39384)):
#0 __pthread_kill_implementation (no_tid=0, signo=11, threadid=139997930061824) at ./nptl/pthread_kill.c:44
#1 __pthread_kill_internal (signo=11, threadid=139997930061824) at ./nptl/pthread_kill.c:78
#2 __GI___pthread_kill (threadid=139997930061824, signo=signo@entry=11) at ./nptl/pthread_kill.c:89
#3 0x00007f53cee83476 in __GI_raise (sig=11) at ../sysdeps/posix/raise.c:26
#4 <signal handler called>
#5 0x00007f534bf94891 in iree_hal_resource_release (any_resource=0x200000000) at external/iree_core/runtime/src/iree/hal/resource.h:89
#6 0x00007f534bf945ca in iree_hal_resource_set_release_blocks (set=0x555dd36faf40, preserve_set=false) at external/iree_core/runtime/src/iree/hal/utils/resource_set.c:66
#7 0x00007f534bf9454e in iree_hal_resource_set_free (set=0x555dd36faf40) at external/iree_core/runtime/src/iree/hal/utils/resource_set.c:105
#8 0x00007f534bf92d8e in iree_hal_deferred_command_buffer_destroy (base_command_buffer=0x555ddb005c40) at external/iree_
trevor-m / compare.py
Created February 5, 2021 17:36
Compare cudnn and TRT with TVM
import tvm
import numpy as np
from tvm import relay
from tvm.contrib import graph_runtime
def compile_graph(use_trt=False):
    x = relay.var("x", shape=(100, 2048, 33, 33), dtype="float32")
    w0 = relay.var("w0", shape=(256, 2048, 3, 3), dtype="float32")
    w1 = relay.var("w1", shape=(256, 256, 3, 3), dtype="float32")
    w2 = relay.var("w2", shape=(256, 256, 3, 3), dtype="float32")
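The preview cuts off inside compile_graph. Below is a minimal sketch, under stated assumptions, of how a cuDNN-vs-TensorRT comparison like this is usually finished with TVM's BYOC flow: stack the declared convolutions, optionally call partition_for_tensorrt, and build for the CUDA target (with -libs=cudnn on the non-TRT path). None of this is the gist's actual code, and the partition_for_tensorrt return value differs between TVM releases.

    # Hedged sketch of the elided body, not the gist's actual code.
    y = relay.nn.conv2d(x, w0, padding=(1, 1), channels=256, kernel_size=(3, 3))
    y = relay.nn.relu(y)
    y = relay.nn.conv2d(y, w1, padding=(1, 1), channels=256, kernel_size=(3, 3))
    y = relay.nn.relu(y)
    y = relay.nn.conv2d(y, w2, padding=(1, 1), channels=256, kernel_size=(3, 3))
    mod = tvm.IRModule.from_expr(relay.Function([x, w0, w1, w2], y))
    if use_trt:
        # Offload supported ops to TensorRT via BYOC.
        from tvm.relay.op.contrib.tensorrt import partition_for_tensorrt
        ret = partition_for_tensorrt(mod)
        # Older TVM returns (mod, config); newer returns just the module.
        mod = ret[0] if isinstance(ret, tuple) else ret
        target = "cuda"
    else:
        # Let the CUDA backend call into cuDNN for the convolutions.
        target = "cuda -libs=cudnn"
    with tvm.transform.PassContext(opt_level=3):
        lib = relay.build(mod, target=target)
    return graph_runtime.GraphModule(lib["default"](tvm.gpu(0)))

Calling compile_graph(False) and compile_graph(True), feeding both the same random inputs, and timing run() then gives the cuDNN-versus-TRT comparison the gist title describes.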
trevor-m / cudnn.log
Last active January 7, 2021 23:09
Slow conv2d trt
==21743== NVPROF is profiling process 21743, command: python3 -m pytest ../tests/python/contrib/test_tensorrt.py -s -k test_slow
enabled targets: llvm -device=arm_cpu; nvptx; cuda; llvm
pytest marker:
============================= test session starts ==============================
platform linux -- Python 3.6.6, pytest-6.1.2, py-1.9.0, pluggy-0.13.1
rootdir: /data/neo-ai-tvm, configfile: pytest.ini
plugins: arraydiff-0.2, cov-2.10.1, openfiles-0.3.0, remotedata-0.3.2, doctestplus-0.1.3
collected 48 items / 47 deselected / 1 selected
../tests/python/contrib/test_tensorrt.py [21:29:43] /data/neo-ai-tvm/src/runtime/contrib/cudnn/conv_forward.cc:245: CUDNN Found 8 fwd algorithms, choosing CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED
trevor-m / repro_slow_conv2d.py
Created January 7, 2021 22:56
Simple network with conv2d to demonstrate how TRT doesn't consider a fast cuDNN kernel. Run with nvprof --profile-from-start-off
import tensorrt as trt
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
import ctypes
_cudart = ctypes.CDLL('libcudart.so')
TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)
input_shape = [100, 2048, 33, 33]
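The preview ends at the input shape. A hedged sketch of how a single-conv2d TensorRT repro is typically completed follows, with cudaProfilerStart/cudaProfilerStop toggled through the ctypes handle so that nvprof --profile-from-start-off only captures the execution. The layer parameters, workspace size, and buffer handling are assumptions, and add_convolution/build_engine are the TensorRT 7-era Python API, not necessarily what the gist uses.

# Hedged sketch, not the gist's actual code.
def profiler_start():
    _cudart.cudaProfilerStart()   # begin the nvprof capture region

def profiler_stop():
    _cudart.cudaProfilerStop()    # end the nvprof capture region

builder = trt.Builder(TRT_LOGGER)
network = builder.create_network(
    1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
inp = network.add_input("x", trt.float32, input_shape)
kernel = np.random.uniform(-1, 1, (256, 2048, 3, 3)).astype(np.float32)
conv = network.add_convolution(inp, 256, (3, 3), kernel)   # TRT 7 API
conv.padding = (1, 1)
network.mark_output(conv.get_output(0))

config = builder.create_builder_config()
config.max_workspace_size = 1 << 32
engine = builder.build_engine(network, config)
context = engine.create_execution_context()

# Allocate device buffers and run one inference inside the profiled region.
x = np.random.uniform(-1, 1, input_shape).astype(np.float32)
out = np.empty(tuple(engine.get_binding_shape(1)), dtype=np.float32)
d_in, d_out = cuda.mem_alloc(x.nbytes), cuda.mem_alloc(out.nbytes)
cuda.memcpy_htod(d_in, x)
profiler_start()
context.execute_v2([int(d_in), int(d_out)])
profiler_stop()
cuda.memcpy_dtoh(out, d_out)

The nvprof timeline then shows which tactic TensorRT picks for the 3x3 convolution, which can be compared against the cuDNN Winograd kernel reported in the cudnn.log gist above. The Relay fragment that follows appears to be the corresponding conv2d/add/relu chain from the original model.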
%1310 = /* ty=Tensor[(100, 2048, 33, 33), float32] */;
%1311 = nn.conv2d(%1310, meta[relay.Constant][139] /* ty=Tensor[(256, 2048, 3, 3), float32] */, padding=[1, 1, 1, 1], channels=256, kernel_size=[3, 3]) /* ty=Tensor[(100, 256, 33, 33), float32] */;
%1312 = add(%1311, meta[relay.Constant][140] /* ty=Tensor[(1, 256, 1, 1), float32] */) /* ty=Tensor[(100, 256, 33, 33), float32] */;
%1313 = nn.relu(%1312) /* ty=Tensor[(100, 256, 33, 33), float32] */;
%1314 = nn.conv2d(%1313, meta[relay.Constant][141] /* ty=Tensor[(256, 256, 3, 3), float32] */, padding=[1, 1, 1, 1], channels=256, kernel_size=[3, 3]) /* ty=Tensor[(100, 256, 33, 33), float32] */;
%1315 = add(%1314, meta[relay.Constant][142] /* ty=Tensor[(1, 256, 1, 1), float32] */) /* ty=Tensor[(100, 256, 33, 33), float32] */;
%1316 = nn.relu(%1315) /* ty=Tensor[(100, 256, 33, 33), float32] */;
%1317 = nn.conv2d(%1316, meta[relay.Constant][143] /* ty=Tensor[(256, 256, 3, 3), float32] */, padding=[1, 1, 1, 1], channels=256, kernel_size=[3, 3]) /
import tvm
import tvm.relay.testing.tf as tf_testing
from tvm import relay
from tvm.relay.frontend.tensorflow_parser import TFParser
from tvm.relay.op.contrib import tensorrt
import numpy as np
# Usage:
# wget http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_fpn_shared_box_predictor_640x640_coco14_sync_2018_07_03.tar.gz
# tar xvf ssd_mobilenet_v1_fpn_shared_box_predictor_640x640_coco14_sync_2018_07_03.tar.gz
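The preview ends at the download/extract steps. A minimal sketch of how such a frozen TensorFlow detection graph is usually carried through the imports shown above (TFParser, from_tensorflow, the tensorrt partition helper, relay.build) follows; the frozen-graph filename, input tensor name, input shape, and target are assumptions, and the partition API's return value differs between TVM releases.

# Hedged sketch, not the gist's actual code.
model_dir = "ssd_mobilenet_v1_fpn_shared_box_predictor_640x640_coco14_sync_2018_07_03"
graph_def = TFParser(model_dir + "/frozen_inference_graph.pb").parse()
graph_def = tf_testing.ProcessGraphDefParam(graph_def)

# Input tensor name and shape are assumptions for this detection model.
shape_dict = {"image_tensor": (1, 640, 640, 3)}
mod, params = relay.frontend.from_tensorflow(graph_def, shape=shape_dict)

# Offload supported subgraphs to TensorRT; older TVM returns (mod, config).
ret = tensorrt.partition_for_tensorrt(mod, params)
mod = ret[0] if isinstance(ret, tuple) else ret

with tvm.transform.PassContext(opt_level=3):
    lib = relay.build(mod, target="cuda", params=params)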