Marek Kolodziej mkolod

## multi_streaming_to_reduce_launch_latency.cu
#include <chrono>
#include <iostream>
#include <vector>
#include <thread>

__global__ void do_nothing(int time_us, int clock_rate) {
  clock_t start = clock64();
  clock_t end;
  for (;;) {
    end = clock64();

## redirect_streams_and_cuda_checks.cu
#include <csignal>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <unistd.h>
#include <limits.h>

#include <iostream>
#include <sstream>
#include <stdexcept>

## cleaning_up_imagenet_valset.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              1 star
            
          
                mkolod
                / cleaning_up_imagenet_valset.md
            
            
              Last active
              August 26, 2020 14:05
            
          
    ImageNet validation set fix:

The training set is organized in directories, with each directory matching a class, e.g. "n01751748" matching "sea snake." However, the valset is a flat dir of JPEGs.
The ImageNet labels provided in the devkit for the validation set (ILSVRC2012_validation_ground_truth.txt) are not consistent with the ordering used by PyTorch/TF/Keras/MXNet/Caffe, etc. for pre-trained models.
For example, in the above ground truth label file, "sea snake" is 490, but in PyTorch/TF, it's 65.

Proof:

ImageNet validation labels: https://gist.githubusercontent.com/aaronpolhamus/964a4411c0906315deb9f4a3723aac57/raw/aa66dd9dbf6b56649fa3fab83659b2acbf3cbfd1/map_clsloc.txt
DL framework labels: https://gist.githubusercontent.com/aaronpolhamus/964a4411c0906315deb9f4a3723aac57/raw/aa66dd9dbf6b56649fa3fab83659b2acbf3cbfd1/map_clsloc.txt


Untar the valset file, you will get a flat dir of JPEGs.
Pull in the unflattening script into the directory where the val images were unpa


## tracking_pytorch_memory_use.py
import torch.cuda
import torch.cuda.memory as cumem
import sys
import ctypes as C

GB = 1 << 30

def get_cuda_memory():
    handle = C.cdll.LoadLibrary("libcudart.so")
    free, total = C.c_long(), C.c_long()

## vimrc
" Use Vim settings, rather than Vi settings (much better!).
" This must be first, because it changes other options as a side effect.
set nocompatible

" TODO: this may not be in the correct place. It is intended to allow overriding <Leader>.
" source ~/.vimrc.before if it exists.
if filereadable(expand("~/.vimrc.before"))
  source ~/.vimrc.before
endif

## profile.py
'''
Memory profiling utilities
'''
import gc
import inspect
import linecache
import os.path
import sys
import time
import threading

## onnx_checker_printer.py
import onnx
import sys

# Load the ONNX model named on the command line, run the ONNX checker on it,
# and print a human-readable rendering of its graph.
# Usage: python onnx_checker_printer.py <model.onnx>
if len(sys.argv) < 2:
    # Without this guard a missing argument surfaces as a bare IndexError.
    sys.exit("usage: {} <model.onnx>".format(sys.argv[0]))

name = sys.argv[1]
model = onnx.load(name)
# The checker runs before printing, so the graph dump is only reached if
# check_model returns normally.
onnx.checker.check_model(model)
print(onnx.helper.printable_graph(model.graph))

## build_your_own_cuda_docker_image.txt
# Download the CUDA 10.2 runfile, install only the toolkit from it, and delete
# the installer in the same RUN step so it is not kept in an image layer.
RUN wget http://developer.download.nvidia.com/compute/cuda/10.2/Prod/local_installers/cuda_10.2.89_440.33.01_linux.run && \
    chmod +x cuda_10.2.89_440.33.01_linux.run && \
    ./cuda_10.2.89_440.33.01_linux.run --silent --toolkit && \
    rm cuda_10.2.89_440.33.01_linux.run

# After installing CUDA, you need to add the env vars below; otherwise you'll get a CUDA runtime version/driver
# version error, even if you're using nvidia-docker.

ENV LIBRARY_PATH=/usr/local/cuda/lib64/stubs
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility

## efficient_mod_pow2_using_mask.py
# precondition: d is power of 2
def mod_pow2(n, d):
    """Return n mod d using a bit mask instead of the % operator.

    Args:
        n: integer dividend.
        d: integer divisor; must be a power of 2 so that d - 1 is a mask
           of the low bits. For any other d the result is not n % d.

    Returns:
        n & (d - 1), which equals n % d when the precondition holds.
    """
    return n & (d - 1)


# Bind the demo operands at module scope so the printed message and the call
# use the same values; n and d are otherwise only parameter names of mod_pow2
# and printing them here would raise NameError.
n, d = 6, 4
print(n, "mod", d, "is", mod_pow2(n, d))


## non_pageable_malloc.cpp
#include <iostream>
#include <sys/mman.h>

template<typename T>
T* pinned_malloc(size_t num_elem) {

    size_t num_bytes = num_elem * sizeof(T);
    T *ptr = static_cast<T *>(malloc(num_bytes));

    if (ptr == nullptr) {
	#include <chrono>
	#include <iostream>
	#include <vector>
	#include <thread>

	__global__ void do_nothing(int time_us, int clock_rate) {
	clock_t start = clock64();
	clock_t end;
	for (;;) {
	end = clock64();
	#include <csignal>
	#include <cstdlib>
	#include <fstream>
	#include <iostream>
	#include <unistd.h>
	#include <limits.h>

	#include <iostream>
	#include <sstream>
	#include <stdexcept>
	import torch.cuda
	import torch.cuda.memory as cumem
	import sys
	import ctypes as C

	GB = 1 << 30

	def get_cuda_memory():
	handle = C.cdll.LoadLibrary("libcudart.so")
	free, total = C.c_long(), C.c_long()
	" Use Vim settings, rather then Vi settings (much better!).
	" This must be first, because it changes other options as a side effect.
	set nocompatible

	" TODO: this may not be in the correct place. It is intended to allow overriding <Leader>.
	" source ~/.vimrc.before if it exists.
	if filereadable(expand("~/.vimrc.before"))
	source ~/.vimrc.before
	endif
	'''
	Memory profiling utilities
	'''
	import gc
	import inspect
	import linecache
	import os.path
	import sys
	import time
	import threading
	import onnx
	import sys

	name = sys.argv[1]
	model = onnx.load(name)
	onnx.checker.check_model(model)
	print(onnx.helper.printable_graph(model.graph))
	RUN wget http://developer.download.nvidia.com/compute/cuda/10.2/Prod/local_installers/cuda_10.2.89_440.33.01_linux.run
	RUN chmod +x cuda_10.2.89_440.33.01_linux.run && \
	./cuda_10.2.89_440.33.01_linux.run --silent --toolkit

	# After installing CUDA, you need to add the env vars below, otherwise you'll get CUDA runtime version/driver
	# version issue, even if you're using nvidia-docker.

	ENV LIBRARY_PATH=/usr/local/cuda/lib64/stubs
	ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
	# precondition: d is power of 2
	def mod_pow2(n, d):
	return ( n & (d-1) )

	print(n, "mod" , d , "is", mod_pow2(6, 4))
	#include <iostream>
	#include <sys/mman.h>

	template<typename T>
	T* pinned_malloc(size_t num_elem) {

	size_t num_bytes = num_elem * sizeof(T);
	T ptr = static_cast<T >(malloc(num_bytes));

	if (ptr == nullptr) {