Simon Fan xmfan

## simple_train.py
# Import the torch library, which provides tools for machine learning
import torch

# Import the Jamba model from the jamba.model module
from jamba.model import Jamba

torch.manual_seed(123)

# Create a tensor of random integers between 0 and 100, with shape (1, 100)
# This simulates a batch of tokens that we will pass through the model

## gist:d2dcddda2f042df35832992753e3df34
#0  0x00007ffff7c8b94c in __pthread_kill_implementation () from /lib64/libc.so.6
#1  0x00007ffff7c3e646 in raise () from /lib64/libc.so.6
#2  0x00007ffff7c287f3 in abort () from /lib64/libc.so.6
#3  0x00007ffff66b135a in __cxxabiv1::__terminate (handler=<optimized out>) at /opt/conda/conda-bld/gcc-compiler_1654084175708/work/gcc/libstdc++-v3/libsupc++/eh_terminate.cc:48
#4  0x00007ffff66b03b9 in __cxa_call_terminate (ue_header=0x14225d90) at /opt/conda/conda-bld/gcc-compiler_1654084175708/work/gcc/libstdc++-v3/libsupc++/eh_call.cc:54
#5  0x00007ffff66b0ae7 in __cxxabiv1::__gxx_personality_v0 (version=<optimized out>, actions=6, exception_class=5138137972254386944, ue_header=0x14225d90, context=<optimized out>) at /opt/conda/conda-bld/gcc-compiler_1654084175708/work/gcc/libstdc++-v3/libsupc++/eh_personality.cc:685
#6  0x00007ffff74f51e4 in _Unwind_RaiseException_Phase2 (exc=0x14225d90, context=0x7fffffffbdb0, frames_p=0x7fffffffbcb8) at /opt/conda/conda-bld/gcc-compiler_1654084175708/work/gcc/libgcc/unwind.inc

## vit.py
import torch
from time import time

import numpy as np
import os
import sys
from torch.profiler import profile, record_function, ProfilerActivity
times=[]

from transformers import AutoImageProcessor, ViTForImageClassification

## vit_with_weight_prepacking.py
import torch

import torch._inductor.config as config
config.cpp.weight_prepack=True
config.freezing=True

from time import time

import numpy as np
import os

## gist:7180f97a1ca97864112361e1208a69ed
(benchmarks) [17:02:01] ~/pytorch (main) > TORCHINDUCTOR_COMPILE_THREADS=1 pytest test/inductor/test_compiled_autograd.py -k 'test_torch_compile or test_access_saved_tensor_twice_without_recomputation_works'
==================================================================================================== test session starts =====================================================================================================
platform linux -- Python 3.10.14, pytest-8.1.1, pluggy-1.4.0
benchmark: 4.0.0 (defaults: timer=time.perf_counter disable_gc=False min_rounds=5 min_time=0.000005 max_time=1.0 calibration_precision=10 warmup=False warmup_iterations=100000)
rootdir: /home/xmfan/pytorch
configfile: pytest.ini
plugins: hypothesis-6.100.1, benchmark-4.0.0, hydra-core-1.3.2, typeguard-4.2.1
collected 455 items / 453 deselected / 2 selected
Running 2 items in this shard
test/inductor/test_compiled_autograd.py .F

## 1.cpp

#include "/tmp/tmp8wdgz8ol/rq/crq573iugmokkndxawm743sgoqnmhemtfiwhap5ducjuyma5rxco.h"
extern "C" void kernel(const float* in_ptr0,
                       const float* in_ptr1,
                       float* out_ptr0,
                       float* out_ptr1)
{
    {
        #pragma omp simd simdlen(8)
        for(long x0=static_cast<long>(0L); x0<static_cast<long>(4L); x0+=static_cast<long>(1L))

## 2.cpp

#include "/tmp/tmp8wdgz8ol/rq/crq573iugmokkndxawm743sgoqnmhemtfiwhap5ducjuyma5rxco.h"
extern "C" void kernel(const float* in_ptr0,
                       const float* in_ptr1,
                       float* out_ptr0)
{
    {
        #pragma omp simd simdlen(8)
        for(long x0=static_cast<long>(0L); x0<static_cast<long>(5L); x0+=static_cast<long>(1L))
        {

## fusedrms.py
import math
import torch
import triton
import triton.language as tl

@triton.autotune(
    configs=[
        triton.Config({}, num_warps=1),
        triton.Config({}, num_warps=2),
        triton.Config({}, num_warps=4),

## out.py
/home/xmfan/.conda/envs/benchmarks/lib/python3.10/site-packages/transformers/utils/generic.py:441: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.
  _torch_pytree._register_pytree_node(
INFO:torch._dynamo.compiled_autograd.__compiled_autograd:TRACED GRAPH
 ===== Compiled autograd graph =====
 <eval_with_key>.0 class CompiledAutograd(torch.nn.Module):
    def forward(self, inputs, sizes, hooks):
        # No stacktrace found for following nodes
        getitem: "f32[]" = inputs[0]
        getitem_1: "f32[2, 4]" = inputs[1]
        getitem_2: "f32[2, 4]" = inputs[2]

## codegened.py

from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
	# Import the torch library, which provides tools for machine learning
	import torch

	# Import the Jamba model from the jamba.model module
	from jamba.model import Jamba

	torch.manual_seed(123)

	# Create a tensor of random integers between 0 and 100, with shape (1, 100)
	# This simulates a batch of tokens that we will pass through the model
	#0 0x00007ffff7c8b94c in __pthread_kill_implementation () from /lib64/libc.so.6
	#1 0x00007ffff7c3e646 in raise () from /lib64/libc.so.6
	#2 0x00007ffff7c287f3 in abort () from /lib64/libc.so.6
	#3 0x00007ffff66b135a in __cxxabiv1::__terminate (handler=<optimized out>) at /opt/conda/conda-bld/gcc-compiler_1654084175708/work/gcc/libstdc++-v3/libsupc++/eh_terminate.cc:48
	#4 0x00007ffff66b03b9 in __cxa_call_terminate (ue_header=0x14225d90) at /opt/conda/conda-bld/gcc-compiler_1654084175708/work/gcc/libstdc++-v3/libsupc++/eh_call.cc:54
	#5 0x00007ffff66b0ae7 in __cxxabiv1::__gxx_personality_v0 (version=<optimized out>, actions=6, exception_class=5138137972254386944, ue_header=0x14225d90, context=<optimized out>) at /opt/conda/conda-bld/gcc-compiler_1654084175708/work/gcc/libstdc++-v3/libsupc++/eh_personality.cc:685
	#6 0x00007ffff74f51e4 in _Unwind_RaiseException_Phase2 (exc=0x14225d90, context=0x7fffffffbdb0, frames_p=0x7fffffffbcb8) at /opt/conda/conda-bld/gcc-compiler_1654084175708/work/gcc/libgcc/unwind.inc
	import torch
	from time import time

	import numpy as np
	import os
	import sys
	from torch.profiler import profile, record_function, ProfilerActivity
	times=[]

	from transformers import AutoImageProcessor, ViTForImageClassification
	import torch

	import torch._inductor.config as config
	config.cpp.weight_prepack=True
	config.freezing=True

	from time import time

	import numpy as np
	import os
	(benchmarks) [17:02:01] ~/pytorch (main) > TORCHINDUCTOR_COMPILE_THREADS=1 pytest test/inductor/test_compiled_autograd.py -k 'test_torch_compile or test_access_saved_tensor_twice_without_recomputation_works'
	==================================================================================================== test session starts =====================================================================================================
	platform linux -- Python 3.10.14, pytest-8.1.1, pluggy-1.4.0
	benchmark: 4.0.0 (defaults: timer=time.perf_counter disable_gc=False min_rounds=5 min_time=0.000005 max_time=1.0 calibration_precision=10 warmup=False warmup_iterations=100000)
	rootdir: /home/xmfan/pytorch
	configfile: pytest.ini
	plugins: hypothesis-6.100.1, benchmark-4.0.0, hydra-core-1.3.2, typeguard-4.2.1
	collected 455 items / 453 deselected / 2 selected
	Running 2 items in this shard
	test/inductor/test_compiled_autograd.py .F

	#include "/tmp/tmp8wdgz8ol/rq/crq573iugmokkndxawm743sgoqnmhemtfiwhap5ducjuyma5rxco.h"
	extern "C" void kernel(const float* in_ptr0,
	const float* in_ptr1,
	float* out_ptr0,
	float* out_ptr1)
	{
	{
	#pragma omp simd simdlen(8)
	for(long x0=static_cast<long>(0L); x0<static_cast<long>(4L); x0+=static_cast<long>(1L))
	import math
	import torch
	import triton
	import triton.language as tl

	@triton.autotune(
	configs=[
	triton.Config({}, num_warps=1),
	triton.Config({}, num_warps=2),
	triton.Config({}, num_warps=4),
	/home/xmfan/.conda/envs/benchmarks/lib/python3.10/site-packages/transformers/utils/generic.py:441: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.
	_torch_pytree._register_pytree_node(
	INFO:torch._dynamo.compiled_autograd.__compiled_autograd:TRACED GRAPH
	===== Compiled autograd graph =====
	<eval_with_key>.0 class CompiledAutograd(torch.nn.Module):
	def forward(self, inputs, sizes, hooks):
	# No stacktrace found for following nodes
	getitem: "f32[]" = inputs[0]
	getitem_1: "f32[2, 4]" = inputs[1]
	getitem_2: "f32[2, 4]" = inputs[2]

	from ctypes import c_void_p, c_long
	import torch
	import math
	import random
	import os
	import tempfile
	from math import inf, nan
	from torch._inductor.hooks import run_intermediate_hooks
	from torch._inductor.utils import maybe_profile