yaroslavvb / install_pdb_handler.py
Created October 11, 2019 17:53
install_pdb_handler
def install_pdb_handler():
    """Signals to automatically start pdb:
    1. CTRL+\\ breaks into pdb.
    2. pdb gets launched on exception.
    """
    import signal
    import pdb

    def handler(_signum, _frame):
        # Drop into the debugger at the point where the signal arrived.
        pdb.set_trace()

    # CTRL+\ sends SIGQUIT on POSIX systems.
    signal.signal(signal.SIGQUIT, handler)
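
To exercise the handler without pressing CTRL+\, a process can send SIGQUIT to itself; a usage sketch, assuming the registration above:

import os
import signal

install_pdb_handler()
os.kill(os.getpid(), signal.SIGQUIT)  # same effect as pressing CTRL+\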

yaroslavvb / simple_signal.py
Last active August 1, 2023 19:41
Example of bringing down a parameter server by using a shared queue to signal shutdown
"""Example of launching a distributed service and then bringing it down."""
import subprocess
import tensorflow as tf
import time
import sys
flags = tf.flags
flags.DEFINE_string("port1", "12222", "port of worker1")
flags.DEFINE_string("port2", "12223", "port of worker2")

yaroslavvb / simple_barrier.py
Created December 16, 2016 06:03
Example of using shared counters to implement a barrier primitive
"""Example of barrier implementation using TensorFlow shared variables.
All workers synchronize on barrier, copy global parameters to local versions
and increment global parameter variable asynchronously. Should see something
like this:
bash> killall python
bash> python simple_barrier.py --num_workers=4
worker 0, local_param 4 global_param 5
worker 2, local_param 4 global_param 7
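
The preview ends before the barrier itself. The underlying idea is that each arriving worker increments a shared counter, then waits until the counter reaches the number of workers. A pure-Python threading analogue for illustration (not the gist's TF shared-variable code):

import threading

class CounterBarrier:
    """Single-use barrier built from a shared counter."""
    def __init__(self, num_workers):
        self.num_workers = num_workers
        self.count = 0
        self.cond = threading.Condition()

    def wait(self):
        with self.cond:
            self.count += 1
            if self.count == self.num_workers:
                self.cond.notify_all()  # last arrival releases everyone
            else:
                while self.count < self.num_workers:
                    self.cond.wait()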

yaroslavvb / sharded_ps_benchmark.py
Last active December 27, 2022 06:25
Example of a local cluster with multiple workers/training loops and a sharded parameter server
#!/usr/bin/env python
# Benchmark transferring data, part of troubleshooting https://github.com/tensorflow/tensorflow/issues/6116
#
# Takes `a` independent workers communicating with `b` parameter shards.
# Each worker tries to add to variables stored on the parameter server as
# fast as possible.
#
# macbook
# ps=1: 1.6 GB/s
# ps=2: 2.6 GB/s
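
The sharding itself amounts to splitting one flat parameter vector into `b` pieces, one per parameter-server shard. A minimal numpy sketch of that layout (illustrative only, not the gist's TF code):

import numpy as np

def shard_params(params, num_shards):
    # Split a flat parameter vector into num_shards nearly-equal pieces.
    return np.array_split(params, num_shards)

shards = shard_params(np.zeros(10), 3)  # shard sizes: 4, 3, 3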

yaroslavvb / benchmark_grpc_recv.py
Last active December 27, 2022 06:24
Benchmark the slowness of passing Tensors around between TF workers
# Dependencies:
# portpicker (pip install portpicker)
# tcmalloc4 (sudo apt-get install google-perftools)
# TF 0.12
#
#
# Benchmarks on Xeon E5-2630 v3 @ 2.40GHz
#
# export LD_PRELOAD=/usr/lib/libtcmalloc.so.4
# python benchmark_grpc_recv.py --data_mb=128
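
The portpicker dependency supplies free ports for building the local cluster spec. A usage sketch (the cluster layout shown is an assumption, not copied from the gist):

import portpicker

worker_port = portpicker.pick_unused_port()
ps_port = portpicker.pick_unused_port()
cluster_spec = {"worker": ["localhost:%d" % worker_port],
                "ps": ["localhost:%d" % ps_port]}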

yaroslavvb / gist:b73ff35424dd7ab762234620cf583aac
Created September 16, 2016 23:08
Example of restricting part of a graph to run on a single core
# try running a CPU-intensive test on two devices
import tensorflow as tf
import time

def matmul_op():
    """Multiply two matrices together"""
    n = 2000
    a = tf.ones((n, n), dtype=tf.float32)
    b = tf.ones((n, n), dtype=tf.float32)
    return tf.matmul(a, b)
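
The preview cuts off before the gist shows its approach. One way to confine TF1-era TensorFlow to a single core is the session thread-pool configuration (a sketch of that technique, not necessarily the gist's method):

config = tf.ConfigProto(intra_op_parallelism_threads=1,
                        inter_op_parallelism_threads=1)
with tf.Session(config=config) as sess:
    sess.run(matmul_op())  # the matmul now runs on a single thread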

yaroslavvb / lyap2.m
Created June 30, 2022 18:58
MATLAB's Lyapunov solver code
function X = lyap2(A, B, C)
%LYAP2 Lyapunov equation solution using eigenvalue decomposition.
% X = LYAP2(A,C) solves the special form of the Lyapunov matrix
% equation:
%
% A*X + X*A' = -C
%
% X = LYAP2(A,B,C) solves the general form of the Lyapunov matrix
% equation:
%
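
For reference, the special form LYAP2(A,C), which solves A*X + X*A' = -C, maps onto SciPy's continuous Lyapunov solver up to the sign of the right-hand side. A Python sketch (not part of the gist):

import numpy as np
from scipy.linalg import solve_continuous_lyapunov

A = np.array([[-2.0, 0.0], [1.0, -3.0]])
C = np.eye(2)
# solve_continuous_lyapunov solves A X + X A^T = Q, so pass Q = -C.
X = solve_continuous_lyapunov(A, -C)
print(np.allclose(A @ X + X @ A.T, -C))  # True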

yaroslavvb / local_distributed_benchmark.py
Last active September 16, 2021 10:26
Benchmark distributed TensorFlow locally by adding a vector of ones on worker2 to a variable on worker1 as fast as possible
"""Benchmark distributed TensorFlow by adding a vector of ones on worker2
to a variable on worker1 as fast as possible.
On a 2014 MacBook, TensorFlow 0.10 shows:
Local rate: 2175.28 MB per second
Distributed rate: 107.13 MB per second
"""

yaroslavvb / hessian_test.py
Created August 25, 2019 12:52
Example of computing the Hessian of a linear layer
# `u` and `TinyMNIST` come from the author's utilities; their imports fall
# outside this preview.
import torch

def test():
    u.seed_random(1)
    data_width = 3
    targets_width = 2
    batch_size = 3
    dataset = TinyMNIST('/tmp', download=True, data_width=data_width,
                        targets_width=targets_width, dataset_size=batch_size)
    trainloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                              shuffle=False)
    d1 = data_width ** 2  # hidden layer size, visible size, output size
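
The preview ends before the Hessian computation. For a self-contained illustration of the same quantity, torch.autograd.functional.hessian can differentiate a linear layer's loss twice with respect to its weights (a sketch under assumed shapes, not the gist's method):

import torch

d_in, d_out, batch = 3, 2, 4
X = torch.randn(batch, d_in)
Y = torch.randn(batch, d_out)

def loss(W):
    # Least-squares loss of a linear layer with weight W of shape (d_out, d_in).
    return ((X @ W.t() - Y) ** 2).mean()

W0 = torch.randn(d_out, d_in)
H = torch.autograd.functional.hessian(loss, W0)
print(H.shape)  # torch.Size([2, 3, 2, 3])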

yaroslavvb / print_schedule.py
Last active January 26, 2020 00:30
An example of turning contraction order into sequence of einsum calls
# An example of turning contraction order into sequence of einsum calls
from opt_einsum import helpers as oe_helpers
import opt_einsum as oe
def print_schedule(path, indices, output_subscript, terms):
    """
    Args:
      path: contraction path in einsum optimizer format, i.e., [(0,), (2,), (1, 3), (0, 2), (0, 1)]
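
A usage sketch: opt_einsum's contract_path produces a contraction path in exactly this format, so its output can feed a function with the signature above (the shapes and subscripts here are illustrative):

import numpy as np
import opt_einsum as oe

a = np.random.rand(4, 5)
b = np.random.rand(5, 6)
c = np.random.rand(6, 7)

# contract_path returns the contraction order plus a PathInfo object.
path, info = oe.contract_path('ab,bc,cd->ad', a, b, c)
print(path)  # e.g. [(0, 1), (0, 1)]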