Alexander Pivovarov apivovarov

## start-docker.sh
# Start an interactive Ubuntu 22.04 container named u2204 with the host's
# ~/workspace bind-mounted into it.
# NOTE(review): the mount target is under /home/joshcao, but the user created
# below is $U (pivovaa) - confirm the intended path.
docker run -ti --name u2204 -v ~/workspace:/home/joshcao/workspace ubuntu:22.04

# Name of the user account created by adduser below.
U=pivovaa

# Refresh the package index, then install base tooling, the OpenSSL
# development package and Python 3 with pip.
# (presumably run inside the container started above - confirm)
apt update
apt install -y adduser sudo vim wget curl \
libssl-dev \
python3 python3-pip

# Create the account named by $U.
adduser $U

## hlo-opt-help.txt
This tool lets you run a given HloModule from a file (or stdin) and convert it
to expanded HLO, fully optimized HLO, or a binary depending on options.

HLO passes are always run, unless the HLO module is already scheduled (has
is_scheduled=True).

You can also pass in debug option flags for the HloModule.

Usage:

## grad.py
# Sample operands and the quotient y = a / b evaluated at them.
a = 5
b = 6
y = a/b

# Small increment added to `a` and used as the divisor in the
# difference quotient computed below.
h = 0.00001

def dy_da_f():
  a2 = a + h
  y2 = a2 / b
  dy_da = (y2 - y) / h

## softmax_jax_hlo.py
import jax
from jax import Array
import jax.numpy as jnp

def init_params(key: Array, shape) -> Array:
    """Draw standard-normal samples of the given shape and cast them to bfloat16.

    Args:
        key: PRNG key forwarded to ``jax.random.normal``.
        shape: Shape of the array to sample.

    Returns:
        The sampled array after ``astype(jax.dtypes.bfloat16)``.
    """
    # Sample with jax.random.normal's default dtype first; the bfloat16 cast
    # is applied afterwards, exactly as a separate step.
    samples = jax.random.normal(key, shape)
    target_dtype = jax.dtypes.bfloat16
    return samples.astype(target_dtype)

def softmax(x):
    mx = x.max(axis=-1, keepdims=True)
    mx = jax.lax.stop_gradient(mx)

## gelu_approx.hlo.txt
HloModule xla_computation_ff, entry_computation_layout={(f32[1,224,224,3]{3,2,1,0})->(f32[1,224,224,3]{3,2,1,0})}

ENTRY main.20 {
  Arg_0.1 = f32[1,224,224,3]{3,2,1,0} parameter(0)
  multiply.10 = f32[1,224,224,3]{3,2,1,0} multiply(Arg_0.1, Arg_0.1)
  multiply.11 = f32[1,224,224,3]{3,2,1,0} multiply(Arg_0.1, multiply.10)
  constant.8 = f32[] constant(0.044715)
  broadcast.9 = f32[1,224,224,3]{3,2,1,0} broadcast(constant.8), dimensions={}
  multiply.12 = f32[1,224,224,3]{3,2,1,0} multiply(multiply.11, broadcast.9)
  add.13 = f32[1,224,224,3]{3,2,1,0} add(Arg_0.1, multiply.12)

## softmax.hlo.txt
HloModule xla_computation_ff, entry_computation_layout={(f32[4,1000]{1,0})->(f32[4,1000]{1,0})}

region_0.4 {
  Arg_0.5 = f32[] parameter(0)
  Arg_1.6 = f32[] parameter(1)
  ROOT maximum.7 = f32[] maximum(Arg_0.5, Arg_1.6)
}

region_1.15 {
  Arg_0.16 = f32[] parameter(0)

## nrvo.cc
#include <iostream>
#include <vector>

std::vector<int> testNRVO(int value, size_t size, const std::vector<int> **localVec)
{
   std::vector<int> vec(size, value);

   *localVec = &vec;

   /* Do something here.. */

## test-roberta-pt.py
import torch
from transformers import RobertaTokenizer, RobertaModel

torch.set_grad_enabled(False)

class RobertaTraceWrapper(torch.nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model
    def forward(self, x):

## cuda_check.cu
// To compile - nvcc cuda_check.cu -o cuda_check -lcuda
// To run ./cuda_check
// set g++ path to older g++ if needed - export NVCC_PREPEND_FLAGS='-ccbin
// /usr/local/gcc-11/bin/g++-11'
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <stdio.h>

/* Outputs some information on CUDA-enabled devices on your computer,
 * including compute capability and current memory usage.

## fcn.py
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torchinfo import summary

# Input size: 28 * 28 = 784.
# (presumably the element count of a flattened 28x28 image - confirm against the model code)
in_sz = 28*28

# Epoch count; the name suggests the number of training epochs, but its use
# is not visible in this excerpt.
n_epochs = 1
	docker run -ti --name u2204 -v ~/workspace:/home/joshcao/workspace ubuntu:22.04

	U=pivovaa

	apt update
	apt install -y adduser sudo vim wget curl \
	libssl-dev \
	python3 python3-pip

	adduser $U
	This tool lets you run a given HloModule from a file (or stdin) and convert it
	to expanded HLO, fully optimized HLO, or a binary depending on options.

	HLO passes are always run, unless the HLO module is already scheduled (has
	is_scheduled=True).

	You can also pass in debug option flags for the HloModule.

	Usage:
	a = 5
	b = 6
	y = a/b

	h = 0.00001

	def dy_da_f():
	a2 = a + h
	y2 = a2 / b
	dy_da = (y2 - y) / h
	import jax
	from jax import Array
	import jax.numpy as jnp

	def init_params(key: Array, shape) -> Array:
	return jax.random.normal(key, shape).astype(jax.dtypes.bfloat16)

	def softmax(x):
	mx = x.max(axis=-1, keepdims=True)
	mx = jax.lax.stop_gradient(mx)
	HloModule xla_computation_ff, entry_computation_layout={(f32[1,224,224,3]{3,2,1,0})->(f32[1,224,224,3]{3,2,1,0})}

	ENTRY main.20 {
	Arg_0.1 = f32[1,224,224,3]{3,2,1,0} parameter(0)
	multiply.10 = f32[1,224,224,3]{3,2,1,0} multiply(Arg_0.1, Arg_0.1)
	multiply.11 = f32[1,224,224,3]{3,2,1,0} multiply(Arg_0.1, multiply.10)
	constant.8 = f32[] constant(0.044715)
	broadcast.9 = f32[1,224,224,3]{3,2,1,0} broadcast(constant.8), dimensions={}
	multiply.12 = f32[1,224,224,3]{3,2,1,0} multiply(multiply.11, broadcast.9)
	add.13 = f32[1,224,224,3]{3,2,1,0} add(Arg_0.1, multiply.12)
	HloModule xla_computation_ff, entry_computation_layout={(f32[4,1000]{1,0})->(f32[4,1000]{1,0})}

	region_0.4 {
	Arg_0.5 = f32[] parameter(0)
	Arg_1.6 = f32[] parameter(1)
	ROOT maximum.7 = f32[] maximum(Arg_0.5, Arg_1.6)
	}

	region_1.15 {
	Arg_0.16 = f32[] parameter(0)
	#include <iostream>
	#include <vector>

	std::vector<int> testNRVO(int value, size_t size, const std::vector<int> **localVec)
	{
	std::vector<int> vec(size, value);

	*localVec = &vec;

	/* Do something here.. */
	import torch
	from transformers import RobertaTokenizer, RobertaModel

	torch.set_grad_enabled(False)

	class RobertaTraceWrapper(torch.nn.Module):
	def __init__(self, model):
	super().__init__()
	self.model = model
	def forward(self, x):
	// To compile - nvcc cuda_check.cu -o cuda_check -lcuda
	// To run ./cuda_check
	// set g++ path to older g++ if needed - export NVCC_PREPEND_FLAGS='-ccbin
	// /usr/local/gcc-11/bin/g++-11'
	#include <cuda.h>
	#include <cuda_runtime_api.h>
	#include <stdio.h>

	/* Outputs some information on CUDA-enabled devices on your computer,
	* including compute capability and current memory usage.
	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	import torch.optim as optim
	import torchvision
	from torchinfo import summary

	in_sz = 28*28

	n_epochs = 1