Skip to content

Instantly share code, notes, and snippets.

# Import the torch library, which provides tools for machine learning
import torch
# Import the Jamba model from the jamba.model module
from jamba.model import Jamba
torch.manual_seed(123)
# Create a tensor of random integers between 0 and 100, with shape (1, 100)
# This simulates a batch of tokens that we will pass through the model
#0 0x00007ffff7c8b94c in __pthread_kill_implementation () from /lib64/libc.so.6
#1 0x00007ffff7c3e646 in raise () from /lib64/libc.so.6
#2 0x00007ffff7c287f3 in abort () from /lib64/libc.so.6
#3 0x00007ffff66b135a in __cxxabiv1::__terminate (handler=<optimized out>) at /opt/conda/conda-bld/gcc-compiler_1654084175708/work/gcc/libstdc++-v3/libsupc++/eh_terminate.cc:48
#4 0x00007ffff66b03b9 in __cxa_call_terminate (ue_header=0x14225d90) at /opt/conda/conda-bld/gcc-compiler_1654084175708/work/gcc/libstdc++-v3/libsupc++/eh_call.cc:54
#5 0x00007ffff66b0ae7 in __cxxabiv1::__gxx_personality_v0 (version=<optimized out>, actions=6, exception_class=5138137972254386944, ue_header=0x14225d90, context=<optimized out>) at /opt/conda/conda-bld/gcc-compiler_1654084175708/work/gcc/libstdc++-v3/libsupc++/eh_personality.cc:685
#6 0x00007ffff74f51e4 in _Unwind_RaiseException_Phase2 (exc=0x14225d90, context=0x7fffffffbdb0, frames_p=0x7fffffffbcb8) at /opt/conda/conda-bld/gcc-compiler_1654084175708/work/gcc/libgcc/unwind.inc
import torch
from time import time
import numpy as np
import os
import sys
from torch.profiler import profile, record_function, ProfilerActivity
times=[]
from transformers import AutoImageProcessor, ViTForImageClassification
import torch
import torch._inductor.config as config
config.cpp.weight_prepack=True
config.freezing=True
from time import time
import numpy as np
import os
(benchmarks) [17:02:01] ~/pytorch (main) > TORCHINDUCTOR_COMPILE_THREADS=1 pytest test/inductor/test_compiled_autograd.py -k 'test_torch_compile or test_access_saved_tensor_twice_without_recomputation_works'
==================================================================================================== test session starts =====================================================================================================
platform linux -- Python 3.10.14, pytest-8.1.1, pluggy-1.4.0
benchmark: 4.0.0 (defaults: timer=time.perf_counter disable_gc=False min_rounds=5 min_time=0.000005 max_time=1.0 calibration_precision=10 warmup=False warmup_iterations=100000)
rootdir: /home/xmfan/pytorch
configfile: pytest.ini
plugins: hypothesis-6.100.1, benchmark-4.0.0, hydra-core-1.3.2, typeguard-4.2.1
collected 455 items / 453 deselected / 2 selected
Running 2 items in this shard
test/inductor/test_compiled_autograd.py .F
#include "/tmp/tmp8wdgz8ol/rq/crq573iugmokkndxawm743sgoqnmhemtfiwhap5ducjuyma5rxco.h"
extern "C" void kernel(const float* in_ptr0,
const float* in_ptr1,
float* out_ptr0,
float* out_ptr1)
{
{
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(0L); x0<static_cast<long>(4L); x0+=static_cast<long>(1L))
#include "/tmp/tmp8wdgz8ol/rq/crq573iugmokkndxawm743sgoqnmhemtfiwhap5ducjuyma5rxco.h"
extern "C" void kernel(const float* in_ptr0,
const float* in_ptr1,
float* out_ptr0)
{
{
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(0L); x0<static_cast<long>(5L); x0+=static_cast<long>(1L))
{
@xmfan
xmfan / fusedrms.py
Created May 3, 2024 18:58
fused rms
import math
import torch
import triton
import triton.language as tl
@triton.autotune(
configs=[
triton.Config({}, num_warps=1),
triton.Config({}, num_warps=2),
triton.Config({}, num_warps=4),
@xmfan
xmfan / out.py
Created April 2, 2024 00:57
test_basic, before boxing inputs
/home/xmfan/.conda/envs/benchmarks/lib/python3.10/site-packages/transformers/utils/generic.py:441: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.
_torch_pytree._register_pytree_node(
INFO:torch._dynamo.compiled_autograd.__compiled_autograd:TRACED GRAPH
===== Compiled autograd graph =====
<eval_with_key>.0 class CompiledAutograd(torch.nn.Module):
def forward(self, inputs, sizes, hooks):
# No stacktrace found for following nodes
getitem: "f32[]" = inputs[0]
getitem_1: "f32[2, 4]" = inputs[1]
getitem_2: "f32[2, 4]" = inputs[2]
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile