@JonathanRaiman
JonathanRaiman / some_file.cu
Created April 23, 2016 21:17
Get reduction over all dimensions to work in mshadow
/*
Reduction over all dimensions in mshadow. Requires changing the structs in
mshadow expressions to store their input expressions by value instead of
by reference.
Installation:
nvcc some_file.cu -std=c++11 -O3 -w -o some_file -I /usr/local/include
Usage:
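For reference, a minimal sketch (plain CUDA, not the gist's mshadow-based code) of what a reduction over all dimensions boils down to: every element of a flat buffer folded into a single scalar, here with a grid-stride loop plus a shared-memory tree reduction.
#include <cstdio>
#include <cuda_runtime.h>

// Sum every element of `in` into *out (assumed zero-initialized).
__global__ void reduce_all(const float* in, float* out, int n) {
    extern __shared__ float buf[];
    int tid = threadIdx.x;
    float acc = 0.0f;
    // grid-stride loop: each thread folds a strided subset of the input
    for (int i = blockIdx.x * blockDim.x + tid; i < n; i += blockDim.x * gridDim.x) {
        acc += in[i];
    }
    buf[tid] = acc;
    __syncthreads();
    // shared-memory tree reduction within the block
    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (tid < stride) buf[tid] += buf[tid + stride];
        __syncthreads();
    }
    if (tid == 0) atomicAdd(out, buf[0]);
}
// launch example: reduce_all<<<64, 256, 256 * sizeof(float)>>>(d_in, d_out, n);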
@JonathanRaiman
JonathanRaiman / array.cu
Created July 8, 2016 07:43
CUDA / CPU one file Array library
#include <vector>
#include <string>
#include <memory>
#include <sstream>
#include <iostream>
#define XINLINE __device__ __host__
#define MAX_DIM 10
#define INDENT_INCREMENT 2
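To illustrate what the XINLINE macro buys (a sketch, not the gist's actual Array class): any method tagged __device__ __host__ compiles for both CPU and GPU, so one shape type can be used inside kernels and in host code alike.
struct Shape {
    int sizes[MAX_DIM];
    int ndim;
    // usable from both host code and CUDA kernels thanks to XINLINE
    XINLINE int numel() const {
        int total = 1;
        for (int i = 0; i < ndim; ++i) total *= sizes[i];
        return total;
    }
    XINLINE int& operator[](int i) { return sizes[i]; }
};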
@JonathanRaiman
JonathanRaiman / awesome_scan.py
Created July 11, 2016 18:50
Scan multi arg in tensorflow
def listify(x):
    if isinstance(x, tuple):
        return list(x)
    return x

def awesome_scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
                 swap_memory=False, name=None):
    """scan on the list of tensors unpacked from `elems` on dimension 0.
    This scan operator repeatedly applies the callable `fn` to a sequence
    of elements from first to last. The elements are made of the tensors
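A hypothetical usage sketch (assuming awesome_scan keeps tf.scan's calling convention while letting both `elems` and the accumulator be tuples of tensors):
import tensorflow as tf

xs = tf.constant([1., 2., 3., 4.])
ys = tf.constant([10., 20., 30., 40.])
# the accumulator is a (running_sum, running_max) pair scanned jointly over (xs, ys)
sums_and_maxes = awesome_scan(
    lambda acc, elem: (acc[0] + elem[0], tf.maximum(acc[1], elem[1])),
    (xs, ys),
    initializer=(tf.constant(0.), tf.constant(0.)))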
@JonathanRaiman
JonathanRaiman / gemm_parallel.cpp
Created July 20, 2016 17:32
Explicit and implicit BLAS gemm parallelism.
/*
Comparing explicit BLAS parallelism using a thread pool
with vendor-implemented parallelism.
The program prints the runtime, averaged over 100 runs, of a matrix
multiply between two float matrices.
To run:
./gemm_parallel [<int> USE_EXPLICIT_PARALLELISM 0/1] [<int> LEADING_DIMENSION]
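A sketch of the two strategies being compared, assuming a CBLAS implementation such as OpenBLAS is linked (not the gist's exact code): explicit parallelism slices the output rows across std::threads, each issuing its own sgemm on a slice, while implicit parallelism hands the full multiply to the vendor library's internal threads.
#include <cblas.h>
#include <algorithm>
#include <thread>
#include <vector>

// C[m x n] = A[m x k] * B[k x n], row-major
void gemm_rows(const float* A, const float* B, float* C, int m, int n, int k) {
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                m, n, k, 1.0f, A, k, B, n, 0.0f, C, n);
}

// explicit parallelism: each thread multiplies a contiguous block of A's rows
// (the vendor library's own thread count should be pinned to 1 in this mode)
void explicit_parallel_gemm(const float* A, const float* B, float* C,
                            int m, int n, int k, int num_threads) {
    std::vector<std::thread> pool;
    int rows_per_thread = (m + num_threads - 1) / num_threads;
    for (int t = 0; t < num_threads; ++t) {
        int row0 = t * rows_per_thread;
        int rows = std::min(rows_per_thread, m - row0);
        if (rows <= 0) break;
        pool.emplace_back(gemm_rows, A + row0 * k, B, C + row0 * n, rows, n, k);
    }
    for (auto& th : pool) th.join();
}

// implicit parallelism: a single call, the BLAS library decides how to thread it
// gemm_rows(A, B, C, m, n, k);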
@JonathanRaiman
JonathanRaiman / array.h
Created September 3, 2016 23:36
Runtime Compilation of CUDA kernels
#ifndef RTC_ARRAY_H
#define RTC_ARRAY_H
#include <vector>
#include <string>
#include <memory>
#include <sstream>
#include <iostream>
#define XINLINE __device__ __host__
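A minimal sketch of the runtime-compilation path a header like this wraps (assuming NVRTC and the CUDA driver API, with error checking and context setup omitted; not the gist's actual Array code): compile a kernel source string to PTX at run time, load it as a module, and fetch the function handle.
#include <nvrtc.h>
#include <cuda.h>
#include <string>

const char* kScaleSource = R"(
extern "C" __global__ void scale(float* data, float alpha, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] *= alpha;
})";

CUfunction compile_scale_kernel(CUmodule* module) {
    nvrtcProgram prog;
    nvrtcCreateProgram(&prog, kScaleSource, "scale.cu", 0, nullptr, nullptr);
    const char* opts[] = {"--std=c++11"};
    nvrtcCompileProgram(prog, 1, opts);            // compile source -> PTX
    size_t ptx_size;
    nvrtcGetPTXSize(prog, &ptx_size);
    std::string ptx(ptx_size, '\0');
    nvrtcGetPTX(prog, &ptx[0]);
    nvrtcDestroyProgram(&prog);
    cuModuleLoadData(module, ptx.c_str());         // load PTX into the current context
    CUfunction kernel;
    cuModuleGetFunction(&kernel, *module, "scale");
    return kernel;                                 // launch later with cuLaunchKernel
}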
@JonathanRaiman
JonathanRaiman / openmp.cpp
Created October 18, 2016 04:45
Cute openmp timing tests
// clang++ openmp.cpp -o openmp -fopenmp -O3 -std=c++11
#include <cassert>
#include <stdlib.h>
#include <cmath>
#include <omp.h>
#include <iostream>
template<typename T>
struct Vector {
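A small timing sketch in the same spirit (not the gist's Vector-based code): compare a serial and an OpenMP-parallel sum of a large array, using omp_get_wtime for wall-clock timing.
#include <omp.h>
#include <vector>
#include <iostream>

int main() {
    std::vector<double> data(1 << 24, 1.0);

    double t0 = omp_get_wtime();
    double serial_sum = 0.0;
    for (size_t i = 0; i < data.size(); ++i) serial_sum += data[i];
    double t1 = omp_get_wtime();

    double parallel_sum = 0.0;
    #pragma omp parallel for reduction(+:parallel_sum)
    for (long i = 0; i < (long)data.size(); ++i) parallel_sum += data[i];
    double t2 = omp_get_wtime();

    std::cout << "serial:   " << (t1 - t0) << "s sum=" << serial_sum << "\n"
              << "parallel: " << (t2 - t1) << "s sum=" << parallel_sum << "\n";
    return 0;
}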
@JonathanRaiman
JonathanRaiman / viterbi.py
Last active February 4, 2020 11:30
tensorflow_viterbi_decode
import tensorflow as tf

def batch_gather_3d(values, indices):
    return tf.gather(tf.reshape(values, [-1, tf.shape(values)[2]]),
                     tf.range(0, tf.shape(values)[0]) * tf.shape(values)[1] +
                     indices)

def batch_gather_2d(values, indices):
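An illustrative check of what batch_gather_3d computes (not part of the gist): for each batch row b it selects values[b, indices[b], :], which the TensorFlow code obtains by flattening the first two axes and offsetting each per-row index by b * time_steps.
import numpy as np

values = np.arange(2 * 3 * 4).reshape(2, 3, 4).astype(np.float32)  # [batch, time, states]
indices = np.array([2, 0])                                         # one time index per batch row
expected = np.stack([values[b, indices[b], :] for b in range(2)])
# batch_gather_3d(tf.constant(values), tf.constant(indices)) evaluates to `expected`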
@JonathanRaiman
JonathanRaiman / faux_cudnn.py
Last active March 22, 2019 11:49
Convert CUDNN LSTM to Dynamic RNN
"""
Little script demonstrating how to run cudnn rnns
without cudnn using dynamic rnn with the same weights
(e.g. train on cudnn, use with dynamic rnn on cpu).
Note: this will run slower than cudnn on a gpu (see below).
Tested on Titan X Pascal:
With cudnn 3.5s vs. with dynamic_rnn 8s to run through 79 batches
with batch size 128.
Network: input size: 127, 2 layer bidirectional LSTM with num_units 200.
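A minimal sketch of the cudnn-free side such a conversion targets (a hypothetical single bidirectional layer with the sizes quoted above; the weight transplanting itself is what the gist implements):
import tensorflow as tf

inputs = tf.placeholder(tf.float32, [None, None, 127])   # [batch, time, input_size]
cell_fw = tf.nn.rnn_cell.LSTMCell(200)
cell_bw = tf.nn.rnn_cell.LSTMCell(200)
outputs, _ = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, dtype=tf.float32)
# the gist's conversion step would then assign the cudnn-trained parameters into
# the cells' kernel/bias variables before running this graph on CPU.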
import random
from deap import algorithms, base, creator, tools
import numpy as np
domains = 100
num_entities = 10000
entity_num_domains = 5
num_mentions = 200
classifications = np.random.binomial(
    1, np.ones(domains) * entity_num_domains / domains, size=(num_entities, domains)
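A minimal DEAP skeleton consistent with the imports and variables above (the fitness here, covering as many entities as possible with a subset of domains, is a hypothetical stand-in, not necessarily the gist's objective):
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()
toolbox.register("attr_bool", random.randint, 0, 1)
toolbox.register("individual", tools.initRepeat, creator.Individual,
                 toolbox.attr_bool, n=domains)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

def coverage(individual):
    chosen = np.array(individual, dtype=bool)
    # an entity counts as covered if it falls under at least one chosen domain
    return (classifications[:, chosen].sum(axis=1) > 0).sum(),

toolbox.register("evaluate", coverage)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)

pop = toolbox.population(n=50)
algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2, ngen=10, verbose=False)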
@JonathanRaiman
JonathanRaiman / plan.py
Last active November 27, 2018 02:25
Dali graph transformation Plan
"""
Micro-dali JIT Plan:
- contains gemm, operator fusion, elementwise/reduction ops.
- supports tensordot
- supports 'jit'
- supports conversion from gemm + im2col to conv2d (NHWC)
- supports 'optimization' passes
- supports 'implementation' registries for specialization
  (e.g. int vs float)
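The "gemm + im2col to conv2d (NHWC)" item above refers to the standard trick of lowering a convolution to a single matrix multiply; a small numpy sketch of that equivalence (stride 1, no padding, hypothetical shapes, not Dali's code):
import numpy as np

def im2col_nhwc(x, kh, kw):
    # x: (N, H, W, C) -> patches: (N, H-kh+1, W-kw+1, kh*kw*C)
    n, h, w, c = x.shape
    out_h, out_w = h - kh + 1, w - kw + 1
    cols = np.empty((n, out_h, out_w, kh * kw * c), dtype=x.dtype)
    for i in range(out_h):
        for j in range(out_w):
            cols[:, i, j, :] = x[:, i:i + kh, j:j + kw, :].reshape(n, -1)
    return cols

def conv2d_via_gemm(x, filters):
    # filters: (kh, kw, C, F); one gemm over the flattened patches gives conv2d
    kh, kw, c, f = filters.shape
    cols = im2col_nhwc(x, kh, kw)
    out = cols.reshape(-1, kh * kw * c) @ filters.reshape(-1, f)
    return out.reshape(x.shape[0], x.shape[1] - kh + 1, x.shape[2] - kw + 1, f)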