Kellen Sunderland (KellenSunderland): public gists
    FILE SIZE        VM SIZE
 --------------  --------------
  73.8%   279Mi   75.2%   279Mi    [section .nv_fatbin]
   7.5%  28.5Mi    4.9%  18.4Mi    [46244 Others]
   1.1%  4.03Mi    1.0%  3.58Mi    mshadow::MapExp<>()
   0.9%  3.56Mi    1.0%  3.56Mi    __sti____cudaRegisterAll()
   0.9%  3.37Mi    0.9%  3.36Mi    mxnet::op::ElemwiseBinaryOp::RspRspOp<>()
   0.0%  2.70Ki    0.7%  2.78Mi    precalc_xorwow_matrix
   0.0%  2.71Ki    0.7%  2.78Mi    precalc_xorwow_offset_matrix
   0.7%  2.78Mi    0.7%  2.78Mi    [section .gcc_except_table]
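This breakdown appears to be Bloaty (github.com/google/bloaty) output for an MXNet shared library, with the embedded CUDA fatbin (.nv_fatbin) dominating the file size. A minimal sketch of how such a table could be produced, assuming Bloaty is installed and that the library path is /path/to/libmxnet.so (an assumption, not taken from the gist):

# Symbol-level size breakdown; bytes not attributable to a symbol fall back to
# their section, which is where rows like "[section .nv_fatbin]" come from.
bloaty -d symbols /path/to/libmxnet.so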
KellenSunderland / TensorRTCompilation.MD
Last active November 16, 2019 00:30
MXNet with TensorRT support compilation guide

Starting from Ubuntu 18.04:

sudo apt-get install tmux htop libssl-dev
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-repo-ubuntu1804_10.0.130-1_amd64.deb
sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub && sudo apt update
sudo dpkg -i cuda-repo-ubuntu1804_10.0.130-1_amd64.deb
sudo apt update
sudo apt install -y cuda
wget https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/nvidia-machine-learning-repo-ubuntu1804_1.0.0-1_amd64.deb
sudo dpkg -i nvidia-machine-learning-repo-*.deb
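The preview stops after adding NVIDIA's machine-learning repository. A hedged sketch of the likely next steps, not taken from the gist; the package names (libcudnn7, libnvinfer5 and their -dev counterparts) are assumptions that depend on the CUDA, cuDNN and TensorRT versions being targeted:

# Install cuDNN and TensorRT development packages from the machine-learning repo,
# then confirm the driver and CUDA toolkit are usable.
sudo apt update
sudo apt install -y libcudnn7 libcudnn7-dev libnvinfer5 libnvinfer-dev
nvidia-smi
/usr/local/cuda/bin/nvcc --version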
KellenSunderland / reproduce.sh
Last active May 2, 2019 01:24
Reproduce Regression
#!/bin/bash
# Build the regression Docker image and run the same model under several MXNet builds.
wget https://gist.githubusercontent.com/KellenSunderland/219fc25d7f796ae5442cfe162d27cc9c/raw/d9015ebd878391930b5ba186ab4ba77042eee943/Dockerfile
nvidia-docker build -f Dockerfile . -t mxnet/regression
nvidia-docker run -ti -e "PYTHONPATH=/work/mxnet-v1.4.0-cmake/python" mxnet/regression python2 run_model.py
nvidia-docker run -ti -e "PYTHONPATH=/work/mxnet-v1.3.0-cmake/python" mxnet/regression python2 run_model.py
nvidia-docker run -ti -e "PYTHONPATH=/work/mxnet-v1.2.0-cmake/python" mxnet/regression python2 run_model.py
nvidia-docker run -ti -e "PYTHONPATH=/work/mxnet-v1.4.0-cmake-profiler/python" mxnet/regression python2 run_model.py
nvidia-docker run -ti -e "PYTHONPATH=/work/mxnet-v1.4.0-make/python" mxnet/regression python2 run_model.py
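The same runs can be expressed as a loop so each build's output is captured in its own log for comparison; a sketch using the build directories from the commands above (the log file names are an assumption):

# Run the benchmark once per MXNet build and keep one log per build.
for build in mxnet-v1.4.0-cmake mxnet-v1.3.0-cmake mxnet-v1.2.0-cmake mxnet-v1.4.0-cmake-profiler mxnet-v1.4.0-make; do
  nvidia-docker run -ti -e "PYTHONPATH=/work/${build}/python" mxnet/regression python2 run_model.py | tee "${build}.log"
done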
KellenSunderland / Dockerfile
Last active May 2, 2019 00:32
Reproduce Regression
FROM kellens/build.ubuntu_gpu_tensorrt:latest
WORKDIR /work
# Fetch the benchmark script and the fcn-xs symbol definition used by the benchmark.
RUN wget https://gist.githubusercontent.com/KellenSunderland/686522830475dfc7073b5d7a97e89d24/raw/a0b12e63b5fbf51f4c2794a9a8dae22a2ac8cab1/run_model.py && \
    wget https://raw.githubusercontent.com/apache/incubator-mxnet/master/example/fcn-xs/symbol_fcnxs.py
# Check out each MXNet version under test.
RUN git clone --recursive https://github.com/apache/incubator-mxnet.git --branch v1.4.x mxnet-v1.4.0-cmake && \
    mkdir -p /work/mxnet-v1.4.0-cmake/lib/ && \
    git clone --recursive https://github.com/apache/incubator-mxnet.git --branch v1.4.x mxnet-v1.4.0-make && \
KellenSunderland / config.mk
Created April 29, 2019 21:25
Regression config
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
KellenSunderland / run_model.py
Created April 29, 2019 21:16
MXNet Benchmarking Script
import mxnet as mx
import numpy as np
import importlib
from collections import namedtuple
import time

# Benchmark helper: runs the module over num_batches batches and reports how long it takes.
def runMx(ctx, mod, data, num_batches, runType):
    print('%s MXNet' % (runType))
    Batch = namedtuple('Batch', ['data'])
    t = 0
KellenSunderland / main.cpp
Last active January 13, 2019 17:11
rvalue test
#include <iostream>
#include <utility>
#include <vector>
#include <cstring>
using namespace std;
class MyType {
public:
KellenSunderland / bt all from hung test
Last active March 29, 2018 17:15
Full trace of hang
Thread 20 (Thread 0x7f43ab149700 (LWP 95971)):
#0 pthread_cond_wait@@GLIBC_2.3.2 () at ../sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
#1 0x00007f4450b1a91c in std::condition_variable::wait(std::unique_lock<std::mutex>&) () from target:/usr/lib/x86_64-linux-gnu/libstdc++.so.6
#2 0x00007f43f7a2463f in std::condition_variable::wait<mxnet::engine::ThreadedEngine::WaitForVar(mxnet::Engine::VarHandle)::<lambda()> > (__p=..., __lock=..., this=0x30d6238)
at /usr/include/c++/5/condition_variable:98
#3 mxnet::engine::ThreadedEngine::WaitForVar (this=<optimized out>, var=0x3d6c318) at src/engine/threaded_engine.cc:387
#4 0x00007f43f922f411 in mxnet::op::CuDNNConvolutionOp<float>::SelectAlgo (this=this@entry=0x7f4328b6bdd0, ctx=..., in_shape=..., out_shape=...,
cudnn_forward_compute_type=cudnn_forward_compute_type@entry=CUDNN_DATA_FLOAT, cudnn_backward_compute_type=cudnn_backward_compute_type@entry=CUDNN_DATA_FLOAT)
at src/operator/nn/./cudnn/cudnn_convolution-inl.h:718
#5 0x00007f43f91efa10

warning: Target and debugger are in different PID namespaces; thread lists and other data are likely unreliable
0x00007f44580af98d in pthread_join (threadid=139928609789696, thread_return=0x0) at pthread_join.c:90
90 pthread_join.c: No such file or directory.
(gdb) bt
#0 0x00007f44580af98d in pthread_join (threadid=139928609789696, thread_return=0x0) at pthread_join.c:90
#1 0x00007f4450b1fb97 in std::thread::join() () from target:/usr/lib/x86_64-linux-gnu/libstdc++.so.6
#2 0x00007f43f7a2f997 in mxnet::engine::ThreadPool::~ThreadPool (this=0x1af15540, __in_chrg=<optimized out>) at src/engine/./thread_pool.h:84
#3 std::default_delete<mxnet::engine::ThreadPool>::operator() (this=<optimized out>, __ptr=0x1af15540) at /usr/include/c++/5/bits/unique_ptr.h:76
#4 std::unique_ptr<mxnet::engine::ThreadPool, std::default_delete<mxnet::engine::ThreadPool> >::~unique_ptr (this=0x19516fe8, __in_chrg=<optimized out>)
at /usr/include/c++/5/bits/unique_ptr.h:236
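A full per-thread trace like the one above can be captured non-interactively once gdb is attached to the hung process. A minimal sketch, assuming the debugger is attached from the host to a process running inside a container (which is also what triggers the PID-namespace warning shown above):

# PID is the process id of the hung test as seen from the debugger's namespace.
PID=12345   # placeholder value, not taken from the gist
gdb --batch -p "$PID" -ex "thread apply all bt" > bt_all.txt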
KellenSunderland / device_props.cu
Last active March 27, 2018 09:22
Cuda Device Check
#include <stdio.h>

// List every visible CUDA device and print its basic properties.
int main() {
  int nDevices;
  cudaGetDeviceCount(&nDevices);
  for (int i = 0; i < nDevices; i++) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, i);
    printf("Device Number: %d\n", i);