
@JonathanRaiman
JonathanRaiman / save_submission.py
Last active October 6, 2021 02:53
Code to submit predictions to cardinality estimation leaderboard @ ML for Systems NeurIPS 2021 workshop
import numpy as np

def save_submission(predictions, filename):
    """Take model output & save for cardinality estimation benchmark upload."""
    np.save(filename, np.array([value for item in predictions
                                for key, value in sorted(item.items())]))

# preds = rf.test(testqs)  # run your prediction code on the test data
# save_submission(preds, "mysubmission.npy")
# Then upload "mysubmission.npy" to the leaderboard https://mlforsystems.wl.r.appspot.com :)
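A minimal sketch of what the helper does to its input (the prediction dicts and values below are invented for illustration): each per-query dict is sorted by key, and all values are flattened into one array before saving.

```python
import numpy as np

def save_submission(predictions, filename):
    """Take model output & save for cardinality estimation benchmark upload."""
    np.save(filename, np.array([value for item in predictions
                                for key, value in sorted(item.items())]))

# Two hypothetical per-query prediction dicts; keys are sorted before flattening.
preds = [{"q2": 10.0, "q1": 5.0}, {"q3": 7.0}]
save_submission(preds, "demo_submission.npy")
loaded = np.load("demo_submission.npy")
# loaded == [5.0, 10.0, 7.0]
```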
@JonathanRaiman
JonathanRaiman / google_tsp_solver.py
Created February 6, 2019 19:46
Approximate TSP solver
from ortools.constraint_solver import pywrapcp
import numpy as np

def _create_distance_callback(dist_matrix):
    # Create a callback to calculate distances between cities.
    def distance_callback(from_node, to_node):
        return int(dist_matrix[from_node][to_node])
    return distance_callback
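A small usage sketch of the callback factory on its own (the city coordinates here are hypothetical, and this omits the OR-Tools routing-model setup the full gist would need): build a pairwise distance matrix, then query the callback the solver would call.

```python
import numpy as np

def _create_distance_callback(dist_matrix):
    # Create a callback to calculate distances between cities.
    def distance_callback(from_node, to_node):
        return int(dist_matrix[from_node][to_node])
    return distance_callback

# Hypothetical 3-city instance: pairwise Euclidean distances.
coords = np.array([[0.0, 0.0], [3.0, 4.0], [3.0, 0.0]])
dist_matrix = np.linalg.norm(coords[:, None] - coords[None, :], axis=-1)
callback = _create_distance_callback(dist_matrix)
# callback(0, 1) -> 5, the 3-4-5 triangle's hypotenuse, truncated to int
```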
@JonathanRaiman
JonathanRaiman / graph_optimization.py
Created July 29, 2018 22:00
Memoized Graph Optimization
from contextlib import contextmanager
import time

CURRENT_SCOPE = []

@contextmanager
def printing_scope(message):
    CURRENT_SCOPE.append(message)
    try:
        yield
    finally:
        last = CURRENT_SCOPE.pop()
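A quick usage sketch: CURRENT_SCOPE behaves as a stack of active messages, so nested `with` blocks see all enclosing scopes, and the stack is empty again once they exit.

```python
from contextlib import contextmanager

CURRENT_SCOPE = []

@contextmanager
def printing_scope(message):
    CURRENT_SCOPE.append(message)
    try:
        yield
    finally:
        CURRENT_SCOPE.pop()

# Nested scopes: the stack reflects every enclosing message.
with printing_scope("outer"):
    with printing_scope("inner"):
        snapshot = list(CURRENT_SCOPE)  # ["outer", "inner"]
after = list(CURRENT_SCOPE)  # [] once all scopes have exited
```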
@JonathanRaiman
JonathanRaiman / clear_subl.sh
Created March 30, 2018 21:19
Clears out Sublime Text's window cache, which can cause issues on boot (macOS)
function clear_subl {
python3 -c "path = '~/Library/Application Support/Sublime Text 3/Local/Auto Save Session.sublime_session'; import os, json; data = json.load(open(os.path.expanduser(path), 'rt')); data['windows'] = []; json.dump(data, open(os.path.expanduser(path), 'wt'))"
}
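The shell one-liner above, unpacked into a standalone Python sketch for readability (the session path is taken from the original gist and applies to Sublime Text 3; newer versions use a different directory):

```python
import json
import os

# Path from the original one-liner (Sublime Text 3 on macOS).
SESSION_PATH = ("~/Library/Application Support/Sublime Text 3/"
                "Local/Auto Save Session.sublime_session")

def clear_sublime_windows(path=SESSION_PATH):
    """Drop the cached window list from Sublime's auto-save session file."""
    full = os.path.expanduser(path)
    with open(full, "rt") as f:
        data = json.load(f)
    data["windows"] = []
    with open(full, "wt") as f:
        json.dump(data, f)
```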
@JonathanRaiman
JonathanRaiman / access_pattern.py
Last active April 29, 2018 07:27
Access Pattern inference
"""
Access Pattern Search
---------------------
Code for simulating the effect of searching for the right access pattern in
a CUDA Kernel computation directed acyclic graph.
The key idea is to have every node in the computation graph return an object
representing "for loops" that can be optionally parallelized using blocks
or threads (followed by syncs).
"""
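An illustrative toy of the idea described above (all names here are hypothetical, not from the gist): each graph node exposes loop objects whose axes a search procedure can try assigning to CUDA blocks or threads.

```python
class Loop:
    """Toy stand-in for the 'for loops' object a graph node might return."""
    def __init__(self, extent):
        self.extent = extent
        self.parallelism = None  # None, "blocks", or "threads"

    def parallelize(self, kind):
        self.parallelism = kind
        return self

# A 2-D elementwise op could expose one loop per axis; an access-pattern
# search then tries mappings onto blocks/threads and keeps the fastest.
rows, cols = Loop(1024), Loop(256)
rows.parallelize("blocks")
cols.parallelize("threads")
```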
@JonathanRaiman
JonathanRaiman / human.cpp
Last active November 27, 2018 02:25
Code auto-generated by Dali
auto a = op::uniform(-20.0, 20.0, {2, 5}).astype(dtype);
a.eval();
auto exped = op::exp(a - op::max(a, {-1}, true));
auto fused_softmax = exped / op::sum(exped, {-1}, true);
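The fused expression above is the standard numerically stable softmax; a NumPy transliteration (not Dali code) of the same computation:

```python
import numpy as np

a = np.random.uniform(-20.0, 20.0, size=(2, 5))
# Subtract the row max before exponentiating so exp never overflows.
exped = np.exp(a - a.max(axis=-1, keepdims=True))
softmax = exped / exped.sum(axis=-1, keepdims=True)
# Each row of softmax now sums to 1.
```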
@JonathanRaiman
JonathanRaiman / plan.py
Last active November 27, 2018 02:25
Dali graph transformation Plan
"""
Micro-dali JIT Plan:
- contains gemm, operator fusion, elementwise/reduction ops.
- supports tensordot
- supports 'jit'
- supports conversion from gemm + im2col to conv2d (NHWC)
- supports 'optimization' passes
- supports 'implementation' registries for specialization
  (e.g. int vs float)
"""
import random
from deap import algorithms, base, creator, tools
import numpy as np
domains = 100
num_entities = 10000
entity_num_domains = 5
num_mentions = 200
classifications = np.random.binomial(
    1, np.ones(domains) * entity_num_domains / domains, size=(num_entities, domains)
)
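In the snippet above, each entity is independently assigned each of the 100 domains with probability `entity_num_domains / domains`, so each row has about 5 active domains in expectation. A quick sanity-check sketch:

```python
import numpy as np

domains = 100
num_entities = 10000
entity_num_domains = 5

classifications = np.random.binomial(
    1, np.ones(domains) * entity_num_domains / domains, size=(num_entities, domains)
)
# Average number of domains per entity should be close to entity_num_domains.
avg_domains_per_entity = classifications.sum(axis=1).mean()
```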
@JonathanRaiman
JonathanRaiman / faux_cudnn.py
Last active March 22, 2019 11:49
Convert CUDNN LSTM to Dynamic RNN
"""
Little script demonstration how to run cudnn rnns
without cudnn using dynamic rnn with the same weights
(e.g. train on cudnn, use with dynamic rnn on cpu).
Note: this will run slower than cudnn on a gpu (see below).
Tested on Titan X Pascal:
With cudnn 3.5s vs. with dynamic_rnn 8s to run through 79 batches
with batch size 128.
Network: input size: 127, 2 layer bidirectional LSTM with num_units 200.
"""
@JonathanRaiman
JonathanRaiman / viterbi.py
Last active February 4, 2020 11:30
tensorflow_viterbi_decode
import tensorflow as tf
def batch_gather_3d(values, indices):
    return tf.gather(tf.reshape(values, [-1, tf.shape(values)[2]]),
                     tf.range(0, tf.shape(values)[0]) * tf.shape(values)[1] +
                     indices)

def batch_gather_2d(values, indices):
    return tf.gather(tf.reshape(values, [-1]),
                     tf.range(0, tf.shape(values)[0]) * tf.shape(values)[1] +
                     indices)
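What `batch_gather_3d` computes, transliterated to NumPy for intuition (a sketch, not part of the gist): flattening the first two axes and offsetting each index by `b * T` selects, for each batch `b`, the row `values[b, indices[b], :]`.

```python
import numpy as np

def batch_gather_3d_np(values, indices):
    """For each batch b, pick the row values[b, indices[b], :]."""
    B, T, D = values.shape
    flat = values.reshape(-1, D)          # [B*T, D]
    return flat[np.arange(B) * T + indices]  # [B, D]

values = np.arange(2 * 3 * 2).reshape(2, 3, 2)
indices = np.array([2, 0])
result = batch_gather_3d_np(values, indices)
# result[0] == values[0, 2] and result[1] == values[1, 0]
```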