Skip to content

Instantly share code, notes, and snippets.

@tugsbayasgalan
Created May 12, 2025 20:44
Show Gist options
  • Save tugsbayasgalan/b03f5c3a3259c6006cbc8c878d3443ac to your computer and use it in GitHub Desktop.
Save tugsbayasgalan/b03f5c3a3259c6006cbc8c878d3443ac to your computer and use it in GitHub Desktop.
This file has been truncated, but you can view the full file.
<frozen importlib._bootstrap_external>:1184: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Device set to use cuda
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
W0512 13:27:46.019000 3957859 site-packages/torch/_export/non_strict_utils.py:499] dimension inputs['z'].shape[0] 0/1 specialized; Dim.AUTO was specified along with a sample input with hint = 1.
V0512 13:27:56.870000 3957859 site-packages/torch/_inductor/codegen/wrapper.py:1471] [__output_code] Auto-tuning code written to /tmp/torchinductor_tmanlaibaatar/tmpsy995vep.py
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] Output wrapper code:
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] #include <torch/csrc/inductor/aoti_include/cuda.h>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] // Definition of AOTI runtime interface functions
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] #include <torch/csrc/inductor/aoti_runtime/interface.h>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] #include <torch/csrc/inductor/aoti_runtime/model_container.h>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] #include <iostream>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] #include <sstream>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] #include <stdexcept>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] #include <vector>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] #define CONVERT_EXCEPTION_TO_ERROR_CODE(...) \
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] try { \
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] __VA_ARGS__ \
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } catch (const std::exception& e) { \
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] std::cerr << "Error: " << e.what() << std::endl; \
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] return AOTI_RUNTIME_FAILURE; \
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } catch (...) { \
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] std::cerr << "Unknown exception occurred." << std::endl; \
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] return AOTI_RUNTIME_FAILURE; \
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } \
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] return AOTI_RUNTIME_SUCCESS;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] #define AOTI_VECTOR_SIZE_CHECK(actual_size, expected_size, name) \
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] do { \
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTI_RUNTIME_CHECK( \
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] actual_size == expected_size, \
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] "expected " + std::string(name) + " vector size to be " + \
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] std::to_string(expected_size) + ", but got " + \
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] std::to_string(actual_size)); \
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } while (0)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] // AOTInductor uses at::addmm_out, which doesn't support
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] // arguments that requires gradient. For this reason, we
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] // enforce no_grad context for run APIs.
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] //
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] // A RAII, thread local (!) guard that enables or disables grad mode upon
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] // construction, and sets it back to the original value upon destruction.
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] struct AOTINoGradGuard {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTINoGradGuard() : prev_mode(aoti_torch_grad_mode_is_enabled()) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] aoti_torch_grad_mode_set_enabled(false);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ~AOTINoGradGuard() {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] aoti_torch_grad_mode_set_enabled(prev_mode);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] bool prev_mode;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] };
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] extern "C" {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerCreate(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle* container_handle,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_t num_models,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] bool is_cpu,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const char* cubin_dir) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] return AOTInductorModelContainerCreateWithDevice(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container_handle,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] num_models,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] is_cpu ? "cpu" : "cuda",
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cubin_dir);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerCreateWithDevice(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle* container_handle,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_t num_models,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const char* device_str,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const char* cubin_dir) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (num_models == 0) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] std::cerr << "Error: num_models must be positive, but got 0" << std::endl;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] return AOTI_RUNTIME_FAILURE;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE({
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] std::optional<std::string> cubin_dir_opt;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (cubin_dir != nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cubin_dir_opt.emplace(cubin_dir);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto* container = new torch::aot_inductor::AOTInductorModelContainer(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] num_models, std::string(device_str), cubin_dir_opt);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] *container_handle =
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<AOTInductorModelContainerHandle>(container);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] })
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerDelete(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle container_handle) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE({
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto* container =
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container_handle);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] delete container;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] });
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerRun(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle container_handle,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AtenTensorHandle* input_handles, // array of input AtenTensorHandle; handles
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] // are stolen; the array itself is borrowed
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_t num_inputs,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AtenTensorHandle*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] output_handles, // array for writing output AtenTensorHandle; handles
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] // will be stolen by the caller; the array itself is
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] // borrowed
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_t num_outputs,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorStreamHandle stream_handle,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIProxyExecutorHandle proxy_executor_handle) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto* container =
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container_handle);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTI_VECTOR_SIZE_CHECK(num_inputs, container->num_inputs(), "inputs");
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTI_VECTOR_SIZE_CHECK(num_outputs, container->num_outputs(), "outputs");
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto stream =
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::DeviceStreamType>(stream_handle);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE({
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTINoGradGuard guard;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container->run(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] input_handles, output_handles, stream, proxy_executor_handle);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] })
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerRunSingleThreaded(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle container_handle,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AtenTensorHandle* input_handles, // array of input AtenTensorHandle; handles
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] // are stolen; the array itself is borrowed
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_t num_inputs,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AtenTensorHandle*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] output_handles, // array for writing output AtenTensorHandle; handles
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] // will be stolen by the caller; the array itself is
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] // borrowed
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_t num_outputs,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorStreamHandle stream_handle,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIProxyExecutorHandle proxy_executor_handle) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto* container =
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container_handle);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTI_VECTOR_SIZE_CHECK(num_inputs, container->num_inputs(), "inputs");
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTI_VECTOR_SIZE_CHECK(num_outputs, container->num_outputs(), "outputs");
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto stream =
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::DeviceStreamType>(stream_handle);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE({
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTINoGradGuard guard;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container->run_single_threaded(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] input_handles, output_handles, stream, proxy_executor_handle);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] })
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerGetNumConstants(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle container_handle,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_t* num_constants) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto* container =
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container_handle);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] { *num_constants = container->num_constants(); })
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerGetConstantName(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle container_handle,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_t idx,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const char** name) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto* container =
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container_handle);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] { *name = container->constant_name(idx); })
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerGetConstantOriginalFQN(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle container_handle,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_t idx,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const char** original_fqn) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto* container =
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container_handle);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] { *original_fqn = container->constant_original_fqn(idx); })
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerGetConstantFromFolded(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle container_handle,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_t idx,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] bool* from_folded) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto* container =
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(container_handle);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE({ *from_folded = container->constant_from_folded(idx); })
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerGetConstantType(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle container_handle,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_t idx,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t* type) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto* container =
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(container_handle);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE({ *type = container->constant_type(idx); })
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerGetConstantDtype(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle container_handle,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_t idx,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t* dtype) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto* container =
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container_handle);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] { *dtype = container->constant_dtype(idx); })
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerExtractConstantsMap(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle container_handle,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorConstantMapHandle constant_map_handle,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] bool use_inactive) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto* container =
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container_handle);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto constants_map = reinterpret_cast<std::unordered_map<std::string, AtenTensorHandle>*>(constant_map_handle);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] { const auto ret = container->extract_constants_map(use_inactive);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] for (const auto& pair: ret) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_map->emplace(pair.first, pair.second);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] })
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerUpdateUserManagedConstantBuffer(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle container_handle,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorConstantMapHandle constant_map_handle,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] bool use_inactive,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] bool validate_full_update) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto* container =
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container_handle);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto input_map = reinterpret_cast<std::unordered_map<std::string, AtenTensorHandle>*>(constant_map_handle);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE({
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container->update_constant_buffer(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] *input_map, use_inactive, validate_full_update, /* user_managed = */ true);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] })
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerUpdateConstantBuffer(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle container_handle,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorConstantMapHandle constant_map_handle,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] bool use_inactive,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] bool validate_full_update) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto* container =
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container_handle);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto input_map = reinterpret_cast<std::unordered_map<std::string, AtenTensorHandle>*>(constant_map_handle);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE({
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container->update_constant_buffer(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] *input_map, use_inactive, validate_full_update);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] })
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerUpdateInactiveConstantBuffer(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle container_handle,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorConstantMapHandle constant_map_handle) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] return AOTInductorModelContainerUpdateConstantBuffer(container_handle,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constant_map_handle,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*use_inactive*/ true,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*validate_full_update*/ true);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerFreeInactiveConstantBuffer(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle container_handle) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto* container =
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container_handle);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE({
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container->free_inactive_constant_buffer();
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] })
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerRunConstantFolding(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle container_handle,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] bool use_inactive,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorStreamHandle stream_handle,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIProxyExecutorHandle proxy_executor_handle) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto* container =
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container_handle);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto stream =
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::DeviceStreamType>(stream_handle);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE({
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTINoGradGuard guard;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container->run_const_fold(use_inactive, stream, proxy_executor_handle);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] })
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerSwapConstantBuffer(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle container_handle) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto* container =
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container_handle);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE({
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container->swap_constant_buffer();
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] })
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerGetNumInputs(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle container_handle,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_t* ret_num_inputs) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto* container =
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container_handle);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] { *ret_num_inputs = container->num_inputs(); })
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerGetInputName(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle container_handle,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_t input_idx,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const char** ret_input_names) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto* container =
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container_handle);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] { *ret_input_names = container->input_name(input_idx); })
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerGetNumOutputs(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle container_handle,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_t* ret_num_outputs) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto* container =
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container_handle);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] { *ret_num_outputs = container->num_outputs(); })
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerGetOutputName(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle container_handle,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_t output_idx,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const char** ret_output_names) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto* container =
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container_handle);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] { *ret_output_names = container->output_name(output_idx); })
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerGetCallSpec(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle container_handle,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const char** in_spec,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const char** out_spec) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto* container =
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container_handle);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE({
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] *in_spec = container->get_in_spec();
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] *out_spec = container->get_out_spec();
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] })
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelCreate(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelHandle* model_handle,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorConstantMapHandle constant_map_handle){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE({
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto constant_map = std::make_shared<torch::aot_inductor::ConstantMap>();
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto constant_array = std::make_shared<std::vector<torch::aot_inductor::ConstantHandle>>();
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto input_map = reinterpret_cast<std::unordered_map<std::string, AtenTensorHandle>*>(constant_map_handle);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto model = new torch::aot_inductor::AOTInductorModel(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constant_map,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constant_array,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] "cpu", // device_str is hardcoded, as AOTInductorModelCreate is only use for CPU models
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ""
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] );
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (input_map) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] for (auto const& kv : *input_map) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constant_map->emplace(kv.first, kv.second);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } else {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] model->load_constants();
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] *model_handle = reinterpret_cast<AOTInductorModelHandle>(model);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] })}
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelRun(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelHandle model_handle,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AtenTensorHandle* input_handles,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AtenTensorHandle* output_handles) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto model =
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::AOTInductorModel*>(model_handle);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE({
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTINoGradGuard guard;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] model->run_impl(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] input_handles,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] output_handles,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] (torch::aot_inductor::DeviceStreamType) nullptr,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] nullptr);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] })
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelDelete(AOTInductorModelHandle model_handle){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE({
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto model = reinterpret_cast<torch::aot_inductor::AOTInductorModel*>(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] model_handle);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] delete model;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] })}
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelGetNumOutputs(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelHandle model_handle,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_t* ret_num_outputs) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE({
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto model = reinterpret_cast<torch::aot_inductor::AOTInductorModel*>(model_handle);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] *ret_num_outputs = model->num_outputs();
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] })
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelUpdateConstantsMap(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelHandle model_handle,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorConstantMapHandle constant_map_handle) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto model =
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::AOTInductorModel*>(model_handle);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE({
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto constant_map = std::make_shared<torch::aot_inductor::ConstantMap>();
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto input_map =
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<std::unordered_map<std::string, AtenTensorHandle>*>(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constant_map_handle);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] for (auto const& kv : *input_map) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constant_map->emplace(kv.first, kv.second);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] model->update_constants_map(std::move(constant_map));
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] })
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } // extern "C"
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
// Evaluates EXPR (a CUDA driver API call) and throws std::runtime_error on
// failure.  The error code is first translated via cuGetErrorString; if that
// translation itself fails, a generic "invalid error code!" message is used.
// NOTE(review): the trailing ';' after `while (0)` is load-bearing in this
// generated file — some call sites (e.g. the cuFuncSetAttribute call in
// loadKernel) omit their own statement terminator and rely on it.  Do not
// remove it without also fixing those call sites.
#define CUDA_DRIVER_CHECK(EXPR)                    \
do {                                               \
    CUresult code = EXPR;                          \
    const char *msg;                               \
    CUresult code_get_error = cuGetErrorString(code, &msg); \
    if (code_get_error != CUDA_SUCCESS) {          \
        throw std::runtime_error(                  \
            std::string("CUDA driver error: ") +   \
            std::string("invalid error code!"));   \
    }                                              \
    if (code != CUDA_SUCCESS) {                    \
        throw std::runtime_error(                  \
            std::string("CUDA driver error: ") +   \
            std::string(msg));                     \
    }                                              \
} while (0);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline CUfunction loadKernel(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] std::string filePath,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::string &funcName,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t sharedMemBytes,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string> &cubinDir = std::nullopt) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (cubinDir) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] std::filesystem::path p1{*cubinDir};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] std::filesystem::path p2{filePath};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filePath = (p1 / p2.filename()).string();
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUmodule mod;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction func;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUDA_DRIVER_CHECK(cuModuleLoad(&mod, filePath.c_str()));
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUDA_DRIVER_CHECK(cuModuleGetFunction(&func, mod, funcName.c_str()));
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (sharedMemBytes > 0) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUDA_DRIVER_CHECK(cuFuncSetAttribute(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] func,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] sharedMemBytes
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ))
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] return func;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void launchKernel(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction func,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t gridX,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t gridY,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t gridZ,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t numWarps,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t sharedMemBytes,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* args[],
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUDA_DRIVER_CHECK(cuLaunchKernel(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] func, gridX, gridY, gridZ, 32*numWarps, 1, 1, sharedMemBytes, stream, args, nullptr
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ));
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
// Pre-cache the dtype/device/layout singletons this model uses — presumably
// so later tensor construction avoids repeated lookups (macros are defined in
// the AOTI runtime headers; confirm there).
CACHE_TORCH_DTYPE(bfloat16);
CACHE_TORCH_DTYPE(float32);
CACHE_TORCH_DEVICE(cuda);
CACHE_TORCH_LAYOUT(strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] namespace torch::aot_inductor {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] namespace {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] class AOTInductorModelKernels : public AOTInductorModelKernelsBase {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] public:
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_per_fused_native_group_norm_24{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_per_fused_native_group_norm_34{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_per_fused_native_group_norm_39{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_per_fused_native_group_norm_4{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__scaled_dot_product_efficient_attention_clone_11{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_22{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_31{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_32{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_48{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_49{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_add_convolution_mul_sigmoid_12{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_add_convolution_mul_sigmoid_17{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_add_convolution_mul_sigmoid_28{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_add_convolution_mul_sigmoid_45{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_add_convolution_mul_sigmoid_60{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_0{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_1{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_10{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_2{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_40{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_55{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_14{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_16{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_19{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_21{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_25{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_27{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_30{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_35{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_41{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_44{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_47{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_5{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_51{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_56{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_59{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_62{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_64{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_sigmoid_36{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_sigmoid_37{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_sigmoid_42{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_sigmoid_52{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_sigmoid_53{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_sigmoid_57{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_sigmoid_6{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_sigmoid_65{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_sigmoid_66{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_sigmoid_67{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_native_group_norm_9{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_red_fused_native_group_norm_13{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_red_fused_native_group_norm_15{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_red_fused_native_group_norm_18{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_red_fused_native_group_norm_20{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_red_fused_native_group_norm_23{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_red_fused_native_group_norm_26{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_red_fused_native_group_norm_29{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_red_fused_native_group_norm_3{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_red_fused_native_group_norm_33{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_red_fused_native_group_norm_38{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_red_fused_native_group_norm_43{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_red_fused_native_group_norm_46{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_red_fused_native_group_norm_50{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_red_fused_native_group_norm_54{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_red_fused_native_group_norm_58{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_red_fused_native_group_norm_61{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_red_fused_native_group_norm_63{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_red_fused_native_group_norm_7{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_red_fused_native_group_norm_8{nullptr};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] };
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } // namespace
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModel::AOTInductorModel(std::shared_ptr<ConstantMap> constants_map,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] std::shared_ptr<std::vector<ConstantHandle>> constants_array,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::string& device_str,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] std::optional<std::string> cubin_dir,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] bool include_weights)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] : AOTInductorModelBase(1, 1, 137, device_str, cubin_dir, true) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inputs_info_[0].name = "arg138_1";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[0].name = "conv_in_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[0].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[0].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[0].data_size = 294912;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[0].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[0].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[0].shape = {512, 16, 3, 3};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[0].stride = {144, 9, 3, 1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[0].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[0].original_fqn = "conv_in.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[1].name = "conv_in_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[1].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[1].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[1].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[1].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[1].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[1].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[1].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[1].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[1].original_fqn = "conv_in.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[2].name = "mid_block_1_norm1_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[2].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[2].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[2].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[2].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[2].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[2].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[2].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[2].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[2].original_fqn = "mid.block_1.norm1.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[3].name = "mid_block_1_norm1_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[3].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[3].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[3].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[3].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[3].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[3].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[3].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[3].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[3].original_fqn = "mid.block_1.norm1.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[4].name = "mid_block_1_conv1_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[4].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[4].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[4].data_size = 9437184;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[4].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[4].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[4].shape = {512, 512, 3, 3};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[4].stride = {4608, 9, 3, 1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[4].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[4].original_fqn = "mid.block_1.conv1.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[5].name = "mid_block_1_conv1_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[5].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[5].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[5].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[5].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[5].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[5].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[5].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[5].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[5].original_fqn = "mid.block_1.conv1.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[6].name = "mid_block_1_norm2_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[6].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[6].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[6].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[6].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[6].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[6].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[6].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[6].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[6].original_fqn = "mid.block_1.norm2.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[7].name = "mid_block_1_norm2_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[7].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[7].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[7].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[7].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[7].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[7].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[7].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[7].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[7].original_fqn = "mid.block_1.norm2.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[8].name = "mid_block_1_conv2_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[8].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[8].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[8].data_size = 9437184;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[8].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[8].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[8].shape = {512, 512, 3, 3};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[8].stride = {4608, 9, 3, 1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[8].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[8].original_fqn = "mid.block_1.conv2.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[9].name = "mid_block_1_conv2_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[9].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[9].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[9].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[9].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[9].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[9].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[9].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[9].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[9].original_fqn = "mid.block_1.conv2.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[10].name = "mid_attn_1_norm_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[10].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[10].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[10].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[10].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[10].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[10].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[10].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[10].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[10].original_fqn = "mid.attn_1.norm.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[11].name = "mid_attn_1_norm_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[11].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[11].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[11].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[11].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[11].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[11].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[11].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[11].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[11].original_fqn = "mid.attn_1.norm.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[12].name = "mid_attn_1_q_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[12].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[12].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[12].data_size = 1048576;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[12].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[12].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[12].shape = {512, 512, 1, 1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[12].stride = {512, 1, 1, 1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[12].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[12].original_fqn = "mid.attn_1.q.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[13].name = "mid_attn_1_q_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[13].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[13].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[13].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[13].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[13].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[13].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[13].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[13].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[13].original_fqn = "mid.attn_1.q.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[14].name = "mid_attn_1_k_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[14].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[14].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[14].data_size = 1048576;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[14].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[14].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[14].shape = {512, 512, 1, 1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[14].stride = {512, 1, 1, 1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[14].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[14].original_fqn = "mid.attn_1.k.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[15].name = "mid_attn_1_k_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[15].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[15].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[15].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[15].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[15].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[15].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[15].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[15].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[15].original_fqn = "mid.attn_1.k.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[16].name = "mid_attn_1_v_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[16].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[16].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[16].data_size = 1048576;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[16].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[16].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[16].shape = {512, 512, 1, 1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[16].stride = {512, 1, 1, 1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[16].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[16].original_fqn = "mid.attn_1.v.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[17].name = "mid_attn_1_v_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[17].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[17].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[17].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[17].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[17].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[17].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[17].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[17].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[17].original_fqn = "mid.attn_1.v.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[18].name = "mid_attn_1_proj_out_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[18].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[18].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[18].data_size = 1048576;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[18].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[18].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[18].shape = {512, 512, 1, 1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[18].stride = {512, 1, 1, 1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[18].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[18].original_fqn = "mid.attn_1.proj_out.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[19].name = "mid_attn_1_proj_out_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[19].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[19].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[19].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[19].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[19].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[19].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[19].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[19].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[19].original_fqn = "mid.attn_1.proj_out.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[20].name = "mid_block_2_norm1_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[20].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[20].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[20].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[20].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[20].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[20].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[20].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[20].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[20].original_fqn = "mid.block_2.norm1.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[21].name = "mid_block_2_norm1_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[21].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[21].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[21].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[21].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[21].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[21].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[21].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[21].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[21].original_fqn = "mid.block_2.norm1.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[22].name = "mid_block_2_conv1_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[22].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[22].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[22].data_size = 9437184;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[22].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[22].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[22].shape = {512, 512, 3, 3};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[22].stride = {4608, 9, 3, 1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[22].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[22].original_fqn = "mid.block_2.conv1.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[23].name = "mid_block_2_conv1_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[23].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[23].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[23].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[23].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[23].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[23].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[23].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[23].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[23].original_fqn = "mid.block_2.conv1.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[24].name = "mid_block_2_norm2_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[24].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[24].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[24].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[24].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[24].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[24].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[24].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[24].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[24].original_fqn = "mid.block_2.norm2.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[25].name = "mid_block_2_norm2_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[25].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[25].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[25].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[25].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[25].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[25].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[25].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[25].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[25].original_fqn = "mid.block_2.norm2.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[26].name = "mid_block_2_conv2_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[26].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[26].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[26].data_size = 9437184;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[26].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[26].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[26].shape = {512, 512, 3, 3};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[26].stride = {4608, 9, 3, 1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[26].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[26].original_fqn = "mid.block_2.conv2.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[27].name = "mid_block_2_conv2_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[27].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[27].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[27].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[27].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[27].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[27].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[27].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[27].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[27].original_fqn = "mid.block_2.conv2.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[28].name = "up_0_block_0_norm1_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[28].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[28].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[28].data_size = 1024;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[28].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[28].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[28].shape = {256};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[28].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[28].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[28].original_fqn = "up.0.block.0.norm1.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[29].name = "up_0_block_0_norm1_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[29].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[29].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[29].data_size = 1024;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[29].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[29].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[29].shape = {256};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[29].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[29].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[29].original_fqn = "up.0.block.0.norm1.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[30].name = "up_0_block_0_conv1_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[30].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[30].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[30].data_size = 1179648;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[30].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[30].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[30].shape = {128, 256, 3, 3};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[30].stride = {2304, 9, 3, 1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[30].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[30].original_fqn = "up.0.block.0.conv1.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[31].name = "up_0_block_0_conv1_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[31].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[31].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[31].data_size = 512;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[31].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[31].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[31].shape = {128};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[31].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[31].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[31].original_fqn = "up.0.block.0.conv1.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[32].name = "up_0_block_0_norm2_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[32].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[32].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[32].data_size = 512;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[32].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[32].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[32].shape = {128};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[32].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[32].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[32].original_fqn = "up.0.block.0.norm2.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[33].name = "up_0_block_0_norm2_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[33].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[33].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[33].data_size = 512;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[33].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[33].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[33].shape = {128};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[33].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[33].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[33].original_fqn = "up.0.block.0.norm2.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[34].name = "up_0_block_0_conv2_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[34].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[34].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[34].data_size = 589824;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[34].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[34].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[34].shape = {128, 128, 3, 3};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[34].stride = {1152, 9, 3, 1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[34].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[34].original_fqn = "up.0.block.0.conv2.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[35].name = "up_0_block_0_conv2_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[35].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[35].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[35].data_size = 512;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[35].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[35].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[35].shape = {128};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[35].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[35].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[35].original_fqn = "up.0.block.0.conv2.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[36].name = "up_0_block_0_nin_shortcut_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[36].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[36].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[36].data_size = 131072;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[36].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[36].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[36].shape = {128, 256, 1, 1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[36].stride = {256, 1, 1, 1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[36].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[36].original_fqn = "up.0.block.0.nin_shortcut.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[37].name = "up_0_block_0_nin_shortcut_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[37].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[37].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[37].data_size = 512;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[37].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[37].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[37].shape = {128};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[37].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[37].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[37].original_fqn = "up.0.block.0.nin_shortcut.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[38].name = "up_0_block_1_norm1_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[38].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[38].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[38].data_size = 512;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[38].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[38].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[38].shape = {128};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[38].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[38].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[38].original_fqn = "up.0.block.1.norm1.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[39].name = "up_0_block_1_norm1_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[39].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[39].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[39].data_size = 512;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[39].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[39].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[39].shape = {128};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[39].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[39].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[39].original_fqn = "up.0.block.1.norm1.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[40].name = "up_0_block_1_conv1_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[40].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[40].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[40].data_size = 589824;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[40].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[40].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[40].shape = {128, 128, 3, 3};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[40].stride = {1152, 9, 3, 1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[40].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[40].original_fqn = "up.0.block.1.conv1.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[41].name = "up_0_block_1_conv1_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[41].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[41].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[41].data_size = 512;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[41].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[41].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[41].shape = {128};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[41].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[41].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[41].original_fqn = "up.0.block.1.conv1.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[42].name = "up_0_block_1_norm2_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[42].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[42].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[42].data_size = 512;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[42].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[42].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[42].shape = {128};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[42].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[42].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[42].original_fqn = "up.0.block.1.norm2.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[43].name = "up_0_block_1_norm2_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[43].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[43].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[43].data_size = 512;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[43].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[43].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[43].shape = {128};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[43].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[43].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[43].original_fqn = "up.0.block.1.norm2.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[44].name = "up_0_block_1_conv2_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[44].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[44].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[44].data_size = 589824;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[44].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[44].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[44].shape = {128, 128, 3, 3};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[44].stride = {1152, 9, 3, 1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[44].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[44].original_fqn = "up.0.block.1.conv2.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[45].name = "up_0_block_1_conv2_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[45].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[45].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[45].data_size = 512;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[45].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[45].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[45].shape = {128};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[45].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[45].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[45].original_fqn = "up.0.block.1.conv2.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[46].name = "up_0_block_2_norm1_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[46].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[46].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[46].data_size = 512;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[46].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[46].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[46].shape = {128};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[46].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[46].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[46].original_fqn = "up.0.block.2.norm1.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[47].name = "up_0_block_2_norm1_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[47].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[47].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[47].data_size = 512;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[47].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[47].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[47].shape = {128};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[47].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[47].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[47].original_fqn = "up.0.block.2.norm1.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[48].name = "up_0_block_2_conv1_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[48].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[48].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[48].data_size = 589824;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[48].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[48].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[48].shape = {128, 128, 3, 3};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[48].stride = {1152, 9, 3, 1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[48].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[48].original_fqn = "up.0.block.2.conv1.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[49].name = "up_0_block_2_conv1_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[49].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[49].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[49].data_size = 512;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[49].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[49].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[49].shape = {128};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[49].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[49].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[49].original_fqn = "up.0.block.2.conv1.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[50].name = "up_0_block_2_norm2_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[50].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[50].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[50].data_size = 512;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[50].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[50].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[50].shape = {128};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[50].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[50].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[50].original_fqn = "up.0.block.2.norm2.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[51].name = "up_0_block_2_norm2_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[51].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[51].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[51].data_size = 512;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[51].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[51].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[51].shape = {128};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[51].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[51].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[51].original_fqn = "up.0.block.2.norm2.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[52].name = "up_0_block_2_conv2_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[52].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[52].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[52].data_size = 589824;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[52].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[52].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[52].shape = {128, 128, 3, 3};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[52].stride = {1152, 9, 3, 1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[52].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[52].original_fqn = "up.0.block.2.conv2.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[53].name = "up_0_block_2_conv2_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[53].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[53].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[53].data_size = 512;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[53].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[53].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[53].shape = {128};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[53].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[53].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[53].original_fqn = "up.0.block.2.conv2.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[54].name = "up_1_block_0_norm1_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[54].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[54].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[54].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[54].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[54].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[54].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[54].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[54].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[54].original_fqn = "up.1.block.0.norm1.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[55].name = "up_1_block_0_norm1_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[55].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[55].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[55].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[55].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[55].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[55].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[55].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[55].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[55].original_fqn = "up.1.block.0.norm1.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[56].name = "up_1_block_0_conv1_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[56].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[56].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[56].data_size = 4718592;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[56].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[56].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[56].shape = {256, 512, 3, 3};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[56].stride = {4608, 9, 3, 1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[56].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[56].original_fqn = "up.1.block.0.conv1.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[57].name = "up_1_block_0_conv1_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[57].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[57].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[57].data_size = 1024;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[57].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[57].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[57].shape = {256};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[57].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[57].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[57].original_fqn = "up.1.block.0.conv1.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[58].name = "up_1_block_0_norm2_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[58].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[58].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[58].data_size = 1024;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[58].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[58].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[58].shape = {256};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[58].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[58].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[58].original_fqn = "up.1.block.0.norm2.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[59].name = "up_1_block_0_norm2_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[59].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[59].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[59].data_size = 1024;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[59].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[59].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[59].shape = {256};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[59].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[59].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[59].original_fqn = "up.1.block.0.norm2.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[60].name = "up_1_block_0_conv2_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[60].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[60].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[60].data_size = 2359296;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[60].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[60].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[60].shape = {256, 256, 3, 3};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[60].stride = {2304, 9, 3, 1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[60].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[60].original_fqn = "up.1.block.0.conv2.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[61].name = "up_1_block_0_conv2_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[61].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[61].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[61].data_size = 1024;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[61].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[61].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[61].shape = {256};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[61].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[61].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[61].original_fqn = "up.1.block.0.conv2.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[62].name = "up_1_block_0_nin_shortcut_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[62].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[62].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[62].data_size = 524288;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[62].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[62].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[62].shape = {256, 512, 1, 1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[62].stride = {512, 1, 1, 1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[62].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[62].original_fqn = "up.1.block.0.nin_shortcut.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[63].name = "up_1_block_0_nin_shortcut_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[63].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[63].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[63].data_size = 1024;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[63].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[63].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[63].shape = {256};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[63].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[63].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[63].original_fqn = "up.1.block.0.nin_shortcut.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[64].name = "up_1_block_1_norm1_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[64].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[64].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[64].data_size = 1024;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[64].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[64].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[64].shape = {256};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[64].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[64].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[64].original_fqn = "up.1.block.1.norm1.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[65].name = "up_1_block_1_norm1_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[65].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[65].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[65].data_size = 1024;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[65].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[65].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[65].shape = {256};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[65].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[65].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[65].original_fqn = "up.1.block.1.norm1.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[66].name = "up_1_block_1_conv1_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[66].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[66].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[66].data_size = 2359296;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[66].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[66].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[66].shape = {256, 256, 3, 3};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[66].stride = {2304, 9, 3, 1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[66].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[66].original_fqn = "up.1.block.1.conv1.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[67].name = "up_1_block_1_conv1_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[67].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[67].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[67].data_size = 1024;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[67].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[67].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[67].shape = {256};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[67].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[67].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[67].original_fqn = "up.1.block.1.conv1.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[68].name = "up_1_block_1_norm2_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[68].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[68].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[68].data_size = 1024;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[68].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[68].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[68].shape = {256};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[68].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[68].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[68].original_fqn = "up.1.block.1.norm2.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[69].name = "up_1_block_1_norm2_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[69].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[69].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[69].data_size = 1024;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[69].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[69].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[69].shape = {256};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[69].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[69].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[69].original_fqn = "up.1.block.1.norm2.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[70].name = "up_1_block_1_conv2_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[70].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[70].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[70].data_size = 2359296;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[70].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[70].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[70].shape = {256, 256, 3, 3};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[70].stride = {2304, 9, 3, 1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[70].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[70].original_fqn = "up.1.block.1.conv2.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[71].name = "up_1_block_1_conv2_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[71].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[71].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[71].data_size = 1024;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[71].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[71].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[71].shape = {256};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[71].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[71].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[71].original_fqn = "up.1.block.1.conv2.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[72].name = "up_1_block_2_norm1_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[72].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[72].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[72].data_size = 1024;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[72].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[72].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[72].shape = {256};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[72].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[72].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[72].original_fqn = "up.1.block.2.norm1.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[73].name = "up_1_block_2_norm1_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[73].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[73].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[73].data_size = 1024;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[73].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[73].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[73].shape = {256};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[73].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[73].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[73].original_fqn = "up.1.block.2.norm1.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[74].name = "up_1_block_2_conv1_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[74].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[74].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[74].data_size = 2359296;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[74].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[74].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[74].shape = {256, 256, 3, 3};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[74].stride = {2304, 9, 3, 1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[74].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[74].original_fqn = "up.1.block.2.conv1.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[75].name = "up_1_block_2_conv1_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[75].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[75].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[75].data_size = 1024;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[75].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[75].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[75].shape = {256};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[75].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[75].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[75].original_fqn = "up.1.block.2.conv1.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[76].name = "up_1_block_2_norm2_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[76].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[76].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[76].data_size = 1024;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[76].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[76].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[76].shape = {256};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[76].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[76].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[76].original_fqn = "up.1.block.2.norm2.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[77].name = "up_1_block_2_norm2_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[77].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[77].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[77].data_size = 1024;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[77].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[77].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[77].shape = {256};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[77].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[77].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[77].original_fqn = "up.1.block.2.norm2.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[78].name = "up_1_block_2_conv2_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[78].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[78].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[78].data_size = 2359296;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[78].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[78].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[78].shape = {256, 256, 3, 3};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[78].stride = {2304, 9, 3, 1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[78].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[78].original_fqn = "up.1.block.2.conv2.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[79].name = "up_1_block_2_conv2_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[79].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[79].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[79].data_size = 1024;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[79].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[79].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[79].shape = {256};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[79].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[79].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[79].original_fqn = "up.1.block.2.conv2.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[80].name = "up_1_upsample_conv_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[80].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[80].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[80].data_size = 2359296;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[80].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[80].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[80].shape = {256, 256, 3, 3};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[80].stride = {2304, 9, 3, 1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[80].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[80].original_fqn = "up.1.upsample.conv.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[81].name = "up_1_upsample_conv_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[81].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[81].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[81].data_size = 1024;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[81].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[81].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[81].shape = {256};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[81].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[81].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[81].original_fqn = "up.1.upsample.conv.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[82].name = "up_2_block_0_norm1_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[82].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[82].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[82].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[82].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[82].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[82].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[82].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[82].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[82].original_fqn = "up.2.block.0.norm1.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[83].name = "up_2_block_0_norm1_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[83].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[83].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[83].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[83].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[83].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[83].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[83].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[83].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[83].original_fqn = "up.2.block.0.norm1.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[84].name = "up_2_block_0_conv1_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[84].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[84].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[84].data_size = 9437184;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[84].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[84].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[84].shape = {512, 512, 3, 3};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[84].stride = {4608, 9, 3, 1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[84].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[84].original_fqn = "up.2.block.0.conv1.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[85].name = "up_2_block_0_conv1_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[85].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[85].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[85].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[85].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[85].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[85].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[85].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[85].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[85].original_fqn = "up.2.block.0.conv1.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[86].name = "up_2_block_0_norm2_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[86].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[86].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[86].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[86].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[86].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[86].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[86].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[86].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[86].original_fqn = "up.2.block.0.norm2.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[87].name = "up_2_block_0_norm2_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[87].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[87].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[87].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[87].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[87].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[87].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[87].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[87].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[87].original_fqn = "up.2.block.0.norm2.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[88].name = "up_2_block_0_conv2_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[88].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[88].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[88].data_size = 9437184;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[88].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[88].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[88].shape = {512, 512, 3, 3};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[88].stride = {4608, 9, 3, 1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[88].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[88].original_fqn = "up.2.block.0.conv2.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[89].name = "up_2_block_0_conv2_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[89].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[89].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[89].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[89].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[89].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[89].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[89].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[89].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[89].original_fqn = "up.2.block.0.conv2.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[90].name = "up_2_block_1_norm1_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[90].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[90].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[90].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[90].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[90].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[90].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[90].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[90].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[90].original_fqn = "up.2.block.1.norm1.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[91].name = "up_2_block_1_norm1_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[91].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[91].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[91].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[91].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[91].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[91].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[91].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[91].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[91].original_fqn = "up.2.block.1.norm1.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[92].name = "up_2_block_1_conv1_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[92].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[92].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[92].data_size = 9437184;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[92].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[92].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[92].shape = {512, 512, 3, 3};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[92].stride = {4608, 9, 3, 1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[92].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[92].original_fqn = "up.2.block.1.conv1.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[93].name = "up_2_block_1_conv1_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[93].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[93].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[93].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[93].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[93].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[93].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[93].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[93].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[93].original_fqn = "up.2.block.1.conv1.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[94].name = "up_2_block_1_norm2_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[94].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[94].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[94].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[94].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[94].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[94].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[94].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[94].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[94].original_fqn = "up.2.block.1.norm2.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[95].name = "up_2_block_1_norm2_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[95].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[95].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[95].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[95].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[95].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[95].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[95].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[95].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[95].original_fqn = "up.2.block.1.norm2.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[96].name = "up_2_block_1_conv2_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[96].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[96].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[96].data_size = 9437184;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[96].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[96].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[96].shape = {512, 512, 3, 3};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[96].stride = {4608, 9, 3, 1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[96].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[96].original_fqn = "up.2.block.1.conv2.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[97].name = "up_2_block_1_conv2_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[97].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[97].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[97].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[97].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[97].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[97].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[97].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[97].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[97].original_fqn = "up.2.block.1.conv2.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[98].name = "up_2_block_2_norm1_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[98].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[98].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[98].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[98].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[98].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[98].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[98].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[98].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[98].original_fqn = "up.2.block.2.norm1.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[99].name = "up_2_block_2_norm1_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[99].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[99].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[99].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[99].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[99].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[99].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[99].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[99].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[99].original_fqn = "up.2.block.2.norm1.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[100].name = "up_2_block_2_conv1_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[100].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[100].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[100].data_size = 9437184;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[100].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[100].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[100].shape = {512, 512, 3, 3};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[100].stride = {4608, 9, 3, 1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[100].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[100].original_fqn = "up.2.block.2.conv1.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[101].name = "up_2_block_2_conv1_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[101].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[101].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[101].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[101].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[101].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[101].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[101].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[101].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[101].original_fqn = "up.2.block.2.conv1.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[102].name = "up_2_block_2_norm2_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[102].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[102].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[102].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[102].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[102].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[102].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[102].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[102].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[102].original_fqn = "up.2.block.2.norm2.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[103].name = "up_2_block_2_norm2_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[103].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[103].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[103].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[103].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[103].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[103].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[103].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[103].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[103].original_fqn = "up.2.block.2.norm2.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[104].name = "up_2_block_2_conv2_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[104].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[104].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[104].data_size = 9437184;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[104].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[104].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[104].shape = {512, 512, 3, 3};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[104].stride = {4608, 9, 3, 1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[104].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[104].original_fqn = "up.2.block.2.conv2.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[105].name = "up_2_block_2_conv2_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[105].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[105].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[105].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[105].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[105].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[105].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[105].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[105].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[105].original_fqn = "up.2.block.2.conv2.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[106].name = "up_2_upsample_conv_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[106].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[106].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[106].data_size = 9437184;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[106].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[106].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[106].shape = {512, 512, 3, 3};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[106].stride = {4608, 9, 3, 1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[106].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[106].original_fqn = "up.2.upsample.conv.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[107].name = "up_2_upsample_conv_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[107].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[107].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[107].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[107].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[107].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[107].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[107].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[107].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[107].original_fqn = "up.2.upsample.conv.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[108].name = "up_3_block_0_norm1_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[108].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[108].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[108].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[108].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[108].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[108].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[108].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[108].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[108].original_fqn = "up.3.block.0.norm1.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[109].name = "up_3_block_0_norm1_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[109].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[109].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[109].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[109].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[109].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[109].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[109].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[109].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[109].original_fqn = "up.3.block.0.norm1.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[110].name = "up_3_block_0_conv1_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[110].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[110].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[110].data_size = 9437184;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[110].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[110].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[110].shape = {512, 512, 3, 3};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[110].stride = {4608, 9, 3, 1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[110].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[110].original_fqn = "up.3.block.0.conv1.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[111].name = "up_3_block_0_conv1_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[111].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[111].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[111].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[111].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[111].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[111].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[111].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[111].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[111].original_fqn = "up.3.block.0.conv1.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[112].name = "up_3_block_0_norm2_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[112].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[112].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[112].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[112].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[112].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[112].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[112].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[112].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[112].original_fqn = "up.3.block.0.norm2.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[113].name = "up_3_block_0_norm2_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[113].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[113].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[113].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[113].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[113].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[113].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[113].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[113].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[113].original_fqn = "up.3.block.0.norm2.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[114].name = "up_3_block_0_conv2_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[114].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[114].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[114].data_size = 9437184;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[114].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[114].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[114].shape = {512, 512, 3, 3};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[114].stride = {4608, 9, 3, 1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[114].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[114].original_fqn = "up.3.block.0.conv2.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[115].name = "up_3_block_0_conv2_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[115].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[115].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[115].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[115].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[115].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[115].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[115].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[115].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[115].original_fqn = "up.3.block.0.conv2.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[116].name = "up_3_block_1_norm1_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[116].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[116].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[116].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[116].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[116].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[116].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[116].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[116].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[116].original_fqn = "up.3.block.1.norm1.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[117].name = "up_3_block_1_norm1_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[117].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[117].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[117].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[117].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[117].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[117].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[117].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[117].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[117].original_fqn = "up.3.block.1.norm1.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[118].name = "up_3_block_1_conv1_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[118].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[118].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[118].data_size = 9437184;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[118].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[118].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[118].shape = {512, 512, 3, 3};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[118].stride = {4608, 9, 3, 1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[118].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[118].original_fqn = "up.3.block.1.conv1.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[119].name = "up_3_block_1_conv1_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[119].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[119].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[119].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[119].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[119].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[119].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[119].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[119].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[119].original_fqn = "up.3.block.1.conv1.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[120].name = "up_3_block_1_norm2_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[120].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[120].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[120].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[120].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[120].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[120].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[120].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[120].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[120].original_fqn = "up.3.block.1.norm2.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[121].name = "up_3_block_1_norm2_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[121].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[121].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[121].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[121].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[121].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[121].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[121].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[121].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[121].original_fqn = "up.3.block.1.norm2.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[122].name = "up_3_block_1_conv2_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[122].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[122].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[122].data_size = 9437184;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[122].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[122].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[122].shape = {512, 512, 3, 3};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[122].stride = {4608, 9, 3, 1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[122].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[122].original_fqn = "up.3.block.1.conv2.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[123].name = "up_3_block_1_conv2_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[123].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[123].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[123].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[123].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[123].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[123].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[123].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[123].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[123].original_fqn = "up.3.block.1.conv2.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[124].name = "up_3_block_2_norm1_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[124].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[124].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[124].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[124].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[124].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[124].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[124].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[124].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[124].original_fqn = "up.3.block.2.norm1.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[125].name = "up_3_block_2_norm1_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[125].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[125].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[125].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[125].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[125].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[125].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[125].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[125].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[125].original_fqn = "up.3.block.2.norm1.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[126].name = "up_3_block_2_conv1_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[126].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[126].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[126].data_size = 9437184;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[126].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[126].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[126].shape = {512, 512, 3, 3};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[126].stride = {4608, 9, 3, 1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[126].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[126].original_fqn = "up.3.block.2.conv1.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[127].name = "up_3_block_2_conv1_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[127].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[127].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[127].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[127].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[127].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[127].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[127].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[127].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[127].original_fqn = "up.3.block.2.conv1.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[128].name = "up_3_block_2_norm2_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[128].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[128].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[128].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[128].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[128].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[128].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[128].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[128].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[128].original_fqn = "up.3.block.2.norm2.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[129].name = "up_3_block_2_norm2_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[129].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[129].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[129].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[129].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[129].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[129].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[129].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[129].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[129].original_fqn = "up.3.block.2.norm2.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[130].name = "up_3_block_2_conv2_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[130].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[130].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[130].data_size = 9437184;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[130].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[130].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[130].shape = {512, 512, 3, 3};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[130].stride = {4608, 9, 3, 1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[130].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[130].original_fqn = "up.3.block.2.conv2.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[131].name = "up_3_block_2_conv2_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[131].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[131].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[131].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[131].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[131].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[131].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[131].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[131].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[131].original_fqn = "up.3.block.2.conv2.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[132].name = "up_3_upsample_conv_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[132].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[132].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[132].data_size = 9437184;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[132].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[132].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[132].shape = {512, 512, 3, 3};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[132].stride = {4608, 9, 3, 1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[132].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[132].original_fqn = "up.3.upsample.conv.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[133].name = "up_3_upsample_conv_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[133].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[133].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[133].data_size = 2048;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[133].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[133].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[133].shape = {512};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[133].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[133].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[133].original_fqn = "up.3.upsample.conv.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[134].name = "norm_out_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[134].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[134].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[134].data_size = 512;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[134].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[134].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[134].shape = {128};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[134].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[134].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[134].original_fqn = "norm_out.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[135].name = "norm_out_bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[135].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[135].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[135].data_size = 512;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[135].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[135].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[135].shape = {128};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[135].stride = {1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[135].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[135].original_fqn = "norm_out.bias";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[136].name = "conv_out_weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[136].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[136].offset = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[136].data_size = 13824;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[136].from_folded = false;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[136].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[136].shape = {3, 128, 3, 3};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[136].stride = {1152, 9, 3, 1};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[136].layout = static_cast<int32_t>(cached_torch_layout_strided);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[136].original_fqn = "conv_out.weight";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] update_constants_map(std::move(constants_map));
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] update_constants_array(std::move(constants_array));
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] in_spec_ = "[1, {\"type\": \"builtins.tuple\", \"context\": \"null\", \"children_spec\": [{\"type\": \"builtins.tuple\", \"context\": \"null\", \"children_spec\": [{\"type\": null, \"context\": null, \"children_spec\": []}]}, {\"type\": \"builtins.dict\", \"context\": \"[]\", \"children_spec\": []}]}]";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] out_spec_ = "[1, {\"type\": null, \"context\": null, \"children_spec\": []}]";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] outputs_info_[0].name = "output0";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] this->kernels_ = std::make_unique<AOTInductorModelKernels>();
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] std::unordered_map<std::string, AtenTensorHandle> AOTInductorModel::const_run_impl(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] DeviceStreamType stream,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIProxyExecutorHandle proxy_executor,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] bool initialization
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (!initialization) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] std::cerr << "[WARNING] Calling constant_folding in model, but compiled with config: "
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] << "aot_inductor.use_runtime_constant_folding=False\n";
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] return {};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } // namespace torch::aot_inductor
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] using namespace torch::aot_inductor;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename out_ptr0_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_convolution_0(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_convolution_0', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 16384},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_convolution_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_convolution_0(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (x0), xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tmp0.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x0), tmp1, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (256 - 1)) / (256));
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_convolution_0 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_convolution_0 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cgxxjgxrusrntatjx5abwxgn54didxlre2pjfyn2zvpdmi3ef4jq.cubin", "triton_poi_fused__to_copy_convolution_0", 0, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_0 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_1 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_2 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_3 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_0, &var_1, &var_2, &global_scratch_3};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_convolution_0, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename out_ptr0_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_convolution_1(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_convolution_1', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 131072},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_convolution_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_convolution_1(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 73728
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (x0), None)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tmp0.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x0), tmp1, None)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (1024 - 1)) / (1024));
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_convolution_1 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_convolution_1 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/c7nh2ovduspc5ybbtrsjalmrhgkv3sixfefklqe2uqflq32po54w.cubin", "triton_poi_fused__to_copy_convolution_1", 0, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_4 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_5 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_6 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_7 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_4, &var_5, &var_6, &global_scratch_7};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_convolution_1, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename out_ptr0_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_convolution_2(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_convolution_2', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 512},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_convolution_2', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_convolution_2(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 512
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (x0), xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tmp0.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x0), tmp1, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (128 - 1)) / (128));
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_convolution_2 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_convolution_2 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/c3evy4rcaln7lqsl6xndoppq5vawyf6l7pleyr7qzmlan2s2lcbk.cubin", "triton_poi_fused__to_copy_convolution_2", 0, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_8 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_9 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_10 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_11 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_8, &var_9, &var_10, &global_scratch_11};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_convolution_2, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename out_ptr0_type_, typename out_ptr1_type_, typename out_ptr2_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_red_fused_native_group_norm_3(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr2_type_& out_ptr2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t r0_numel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_red_fused_native_group_norm_3', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.reduction(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 64, 'r0_': 8192},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reduction_hint=ReductionHint.INNER,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'out_ptr2': '*fp32', 'ks0': 'i32', 'ks1': 'i32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_native_group_norm_3', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 2, 'num_reduction': 3, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_red_fused_native_group_norm_3(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, ks0, ks1, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 64
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rnumel = r0_numel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rbase = r0_base
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = (xindex % 2)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // 2
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x3 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_index = r0_offset + r0_base
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_mask = r0_index < r0_numel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] roffset = r0_offset
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rindex = r0_index
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_2 = r0_index
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (ks0*ks1*((((r0_2 + 8*ks0*ks1*x0) // (ks0*ks1)) % 16)) + 16*ks0*ks1*x1 + ((((r0_2 % (ks0*ks1))) % (ks0*ks1)))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (16*x1 + ((((r0_2 + 8*ks0*ks1*x0) // (ks0*ks1)) % 16))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tmp0 + tmp1
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tmp2.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.broadcast_to(tmp3, [XBLOCK, R0_BLOCK])
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_mean_next, tmp5_m2_next, tmp5_weight_next = triton_helpers.welford_reduce(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4, tmp5_mean, tmp5_m2, tmp5_weight, roffset == 0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_mean = tl.where(r0_mask & xmask, tmp5_mean_next, tmp5_mean)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_m2 = tl.where(r0_mask & xmask, tmp5_m2_next, tmp5_m2)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_weight = tl.where(r0_mask & xmask, tmp5_weight_next, tmp5_weight)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8, tmp9, tmp10 = triton_helpers.welford(tmp5_mean, tmp5_m2, tmp5_weight, 1)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp8[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tmp9[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tmp10[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x3), tmp5, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x3), tmp6, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr2 + (x3), tmp7, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_red_fused_native_group_norm_3 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_red_fused_native_group_norm_3 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cw5dqtidbmmszjcofl4rzmbldngnokquurk3ckhqfx4yk3o6rttr.cubin", "triton_red_fused_native_group_norm_3", 192, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_12 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_13 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_14 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_15 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_16 = reinterpret_cast<CUdeviceptr>(out_ptr2.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_17 = ks0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_18 = ks1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_19 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_20 = r0_numel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_21 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_12, &var_13, &var_14, &var_15, &var_16, &var_17, &var_18, &var_19, &var_20, &global_scratch_21};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_red_fused_native_group_norm_3, grid_0, grid_1, grid_2, 16, 192, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename out_ptr0_type_, typename out_ptr1_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_per_fused_native_group_norm_4(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t r0_numel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_per_fused_native_group_norm_4', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.persistent_reduction(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 32, 'r0_': 2},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reduction_hint=ReductionHint.INNER,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_per_fused_native_group_norm_4', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 2, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_per_fused_native_group_norm_4(in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, xnumel, r0_numel, XBLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 32
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_numel = 2
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] R0_BLOCK: tl.constexpr = 2
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rnumel = r0_numel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_index = tl.arange(0, R0_BLOCK)[None, :]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_offset = 0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_mask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] roffset = r0_offset
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rindex = r0_index
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_1 = r0_index
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (r0_1 + 2*x0), xmask, other=0.0)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (r0_1 + 2*x0), xmask, other=0.0)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tl.load(in_ptr2 + (r0_1 + 2*x0), xmask, other=0.0)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tl.broadcast_to(tmp0, [XBLOCK, R0_BLOCK])
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.broadcast_to(tmp1, [XBLOCK, R0_BLOCK])
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tl.broadcast_to(tmp2, [XBLOCK, R0_BLOCK])
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tl.where(xmask, tmp3, 0)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tl.where(xmask, tmp4, 0)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = tl.where(xmask, tmp5, 0)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10, tmp11, tmp12 = triton_helpers.welford(tmp7, tmp8, tmp9, 1)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp13 = tmp10[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp14 = tmp11[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp15 = tmp12[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x0), tmp13, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x0), tmp14, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (8 - 1)) / (8));
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_per_fused_native_group_norm_4 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_per_fused_native_group_norm_4 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cyf5aory652x6536phtdflvp5a6acfvciiqgqz3ueruoebzgxtng.cubin", "triton_per_fused_native_group_norm_4", 0, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_22 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_23 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_24 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_25 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_26 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_27 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_28 = r0_numel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_29 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_22, &var_23, &var_24, &var_25, &var_26, &var_27, &var_28, &global_scratch_29};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_per_fused_native_group_norm_4, grid_0, grid_1, grid_2, 2, 0, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename in_ptr5_type_, typename out_ptr1_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_5(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr3_type_& in_ptr3,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr4_type_& in_ptr4,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr5_type_& in_ptr5,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_5', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 524288},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'in_ptr5': '*fp32', 'out_ptr1': '*bf16', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (10,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_5', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 6, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_5(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr1, ks0, ks1, ks2, xnumel, XBLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x2 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // ks0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (x2), xmask, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (x1), xmask, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.load(in_ptr2 + (x1 // 16), xmask, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tl.load(in_ptr3 + (x1 // 16), xmask, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp14 = tl.load(in_ptr4 + (x1), xmask, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp16 = tl.load(in_ptr5 + (x1), xmask, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tmp0 + tmp1
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tmp2.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp3 - tmp4
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = 16*ks1*ks2
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tmp7.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = (tmp6 / tmp8)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10 = 1e-06
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp11 = tmp9 + tmp10
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp12 = libdevice.rsqrt(tmp11)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp13 = tmp5 * tmp12
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp15 = tmp13 * tmp14
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp17 = tmp15 + tmp16
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp18 = tl.sigmoid(tmp17)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp19 = tmp17 * tmp18
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp20 = tmp19.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x2), tmp20, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (1024 - 1)) / (1024));
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_5 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_5 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/c6v4bvys5ewmltqf3ivtpd7jhmo6spoliy2aeunfsje7jgwnsmw3.cubin", "triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_5", 0, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_30 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_31 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_32 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_33 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_34 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_35 = reinterpret_cast<CUdeviceptr>(in_ptr5.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_36 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_37 = ks0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_38 = ks1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_39 = ks2;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_40 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_41 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_30, &var_31, &var_32, &var_33, &var_34, &var_35, &var_36, &var_37, &var_38, &var_39, &var_40, &global_scratch_41};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_5, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename out_ptr0_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_convolution_mul_sigmoid_6(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_convolution_mul_sigmoid_6', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 4194304},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_convolution_mul_sigmoid_6', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_convolution_mul_sigmoid_6(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 2359296
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (x0), None)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tmp0.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x0), tmp1, None)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (1024 - 1)) / (1024));
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_convolution_mul_sigmoid_6 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_convolution_mul_sigmoid_6 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cqon4fmhenlp4qvhco6q5yusdakrngw3zldxf5reydmahju5yr5g.cubin", "triton_poi_fused__to_copy_convolution_mul_sigmoid_6", 0, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_42 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_43 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_44 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_45 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_42, &var_43, &var_44, &global_scratch_45};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_convolution_mul_sigmoid_6, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename out_ptr0_type_, typename out_ptr1_type_, typename out_ptr2_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_red_fused_native_group_norm_7(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr2_type_& out_ptr2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t r0_numel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_red_fused_native_group_norm_7', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.reduction(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 64, 'r0_': 8192},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reduction_hint=ReductionHint.INNER,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'out_ptr2': '*fp32', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_native_group_norm_7', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 2, 'num_reduction': 3, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_red_fused_native_group_norm_7(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, ks0, ks1, ks2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 64
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rnumel = r0_numel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rbase = r0_base
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = (xindex % 2)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // 2
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x3 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_index = r0_offset + r0_base
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_mask = r0_index < r0_numel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] roffset = r0_offset
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rindex = r0_index
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_2 = r0_index
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (ks1*ks2*((((r0_2 + 8*ks1*ks2*x0) // ks0) % 16)) + 16*ks1*ks2*x1 + ((((r0_2 % ks0)) % ks0))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (16*x1 + ((((r0_2 + 8*ks1*ks2*x0) // ks0) % 16))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tmp0 + tmp1
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tmp2.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.broadcast_to(tmp3, [XBLOCK, R0_BLOCK])
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_mean_next, tmp5_m2_next, tmp5_weight_next = triton_helpers.welford_reduce(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4, tmp5_mean, tmp5_m2, tmp5_weight, roffset == 0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_mean = tl.where(r0_mask & xmask, tmp5_mean_next, tmp5_mean)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_m2 = tl.where(r0_mask & xmask, tmp5_m2_next, tmp5_m2)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_weight = tl.where(r0_mask & xmask, tmp5_weight_next, tmp5_weight)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8, tmp9, tmp10 = triton_helpers.welford(tmp5_mean, tmp5_m2, tmp5_weight, 1)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp8[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tmp9[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tmp10[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x3), tmp5, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x3), tmp6, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr2 + (x3), tmp7, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_red_fused_native_group_norm_7 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_red_fused_native_group_norm_7 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/ctgaufphdl4ydfbxralbmsuk6x5y5iapr4cfau4wnsvalvze5gzv.cubin", "triton_red_fused_native_group_norm_7", 192, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_46 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_47 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_48 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_49 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_50 = reinterpret_cast<CUdeviceptr>(out_ptr2.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_51 = ks0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_52 = ks1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_53 = ks2;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_54 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_55 = r0_numel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_56 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_46, &var_47, &var_48, &var_49, &var_50, &var_51, &var_52, &var_53, &var_54, &var_55, &global_scratch_56};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_red_fused_native_group_norm_7, grid_0, grid_1, grid_2, 16, 192, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename out_ptr0_type_, typename out_ptr1_type_, typename out_ptr2_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_red_fused_native_group_norm_8(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr3_type_& in_ptr3,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr2_type_& out_ptr2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t r0_numel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_red_fused_native_group_norm_8', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.reduction(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 64, 'r0_': 8192},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reduction_hint=ReductionHint.INNER,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'in_ptr3': '*bf16', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'out_ptr2': '*fp32', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (10,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_native_group_norm_8', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 4, 'num_reduction': 3, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_red_fused_native_group_norm_8(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, out_ptr1, out_ptr2, ks0, ks1, ks2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 64
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rnumel = r0_numel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rbase = r0_base
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = (xindex % 2)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // 2
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x3 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_index = r0_offset + r0_base
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_mask = r0_index < r0_numel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] roffset = r0_offset
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rindex = r0_index
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_2 = r0_index
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (ks1*ks2*((((r0_2 + 8*ks1*ks2*x0) // ks0) % 16)) + 16*ks1*ks2*x1 + ((((r0_2 % ks0)) % ks0))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (16*x1 + ((((r0_2 + 8*ks1*ks2*x0) // ks0) % 16))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tl.load(in_ptr2 + (ks1*ks2*((((r0_2 + 8*ks1*ks2*x0) // ks0) % 16)) + 16*ks1*ks2*x1 + ((((r0_2 % ks0)) % ks0))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.load(in_ptr3 + (16*x1 + ((((r0_2 + 8*ks1*ks2*x0) // ks0) % 16))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tmp0 + tmp1
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp3 + tmp4
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tmp2 + tmp5
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tmp6.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tl.broadcast_to(tmp7, [XBLOCK, R0_BLOCK])
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9_mean_next, tmp9_m2_next, tmp9_weight_next = triton_helpers.welford_reduce(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8, tmp9_mean, tmp9_m2, tmp9_weight, roffset == 0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9_mean = tl.where(r0_mask & xmask, tmp9_mean_next, tmp9_mean)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9_m2 = tl.where(r0_mask & xmask, tmp9_m2_next, tmp9_m2)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9_weight = tl.where(r0_mask & xmask, tmp9_weight_next, tmp9_weight)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp12, tmp13, tmp14 = triton_helpers.welford(tmp9_mean, tmp9_m2, tmp9_weight, 1)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = tmp12[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10 = tmp13[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp11 = tmp14[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x3), tmp9, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x3), tmp10, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr2 + (x3), tmp11, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_red_fused_native_group_norm_8 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_red_fused_native_group_norm_8 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/c7dgp7owleeb43ezkfyawptppgcsfmwynfgsbo2qqvot2g3ckueb.cubin", "triton_red_fused_native_group_norm_8", 192, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_57 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_58 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_59 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_60 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_61 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_62 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_63 = reinterpret_cast<CUdeviceptr>(out_ptr2.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_64 = ks0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_65 = ks1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_66 = ks2;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_67 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_68 = r0_numel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_69 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_57, &var_58, &var_59, &var_60, &var_61, &var_62, &var_63, &var_64, &var_65, &var_66, &var_67, &var_68, &global_scratch_69};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_red_fused_native_group_norm_8, grid_0, grid_1, grid_2, 16, 192, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename in_ptr5_type_, typename in_ptr6_type_, typename in_ptr7_type_, typename out_ptr1_type_, typename out_ptr2_type_, typename out_ptr3_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_convolution_native_group_norm_9(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr3_type_& in_ptr3,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr4_type_& in_ptr4,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr5_type_& in_ptr5,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr6_type_& in_ptr6,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr7_type_& in_ptr7,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr2_type_& out_ptr2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr3_type_& out_ptr3,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_convolution_native_group_norm_9', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 524288},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'in_ptr3': '*bf16', 'in_ptr4': '*fp32', 'in_ptr5': '*fp32', 'in_ptr6': '*fp32', 'in_ptr7': '*fp32', 'out_ptr1': '*bf16', 'out_ptr2': '*bf16', 'out_ptr3': '*bf16', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]], (9,): [['tt.divisibility', 16]], (10,): [['tt.divisibility', 16]], (14,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_convolution_native_group_norm_9', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 8, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_convolution_native_group_norm_9(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, out_ptr1, out_ptr2, out_ptr3, ks0, ks1, ks2, xnumel, XBLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x2 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // ks0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (x2), xmask, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (x1), xmask, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tl.load(in_ptr2 + (x2), xmask, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.load(in_ptr3 + (x1), xmask, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tl.load(in_ptr4 + (x1 // 16), xmask, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10 = tl.load(in_ptr5 + (x1 // 16), xmask, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp18 = tl.load(in_ptr6 + (x1), xmask, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp20 = tl.load(in_ptr7 + (x1), xmask, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tmp0 + tmp1
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp3 + tmp4
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tmp2 + tmp5
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tmp6.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = tmp7 - tmp8
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp11 = 16*ks1*ks2
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp12 = tmp11.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp13 = (tmp10 / tmp12)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp14 = 1e-06
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp15 = tmp13 + tmp14
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp16 = libdevice.rsqrt(tmp15)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp17 = tmp9 * tmp16
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp19 = tmp17 * tmp18
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp21 = tmp19 + tmp20
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp22 = tmp21.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x2), tmp22, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr2 + (x2), tmp22, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr3 + (x2), tmp22, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (512 - 1)) / (512));
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_convolution_native_group_norm_9 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_convolution_native_group_norm_9 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/coanhxisptitfdasawvc6iwnfch3gcc4eivcrjds6y26a7jgxjzb.cubin", "triton_poi_fused__to_copy_convolution_native_group_norm_9", 0, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_70 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_71 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_72 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_73 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_74 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_75 = reinterpret_cast<CUdeviceptr>(in_ptr5.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_76 = reinterpret_cast<CUdeviceptr>(in_ptr6.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_77 = reinterpret_cast<CUdeviceptr>(in_ptr7.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_78 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_79 = reinterpret_cast<CUdeviceptr>(out_ptr2.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_80 = reinterpret_cast<CUdeviceptr>(out_ptr3.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_81 = ks0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_82 = ks1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_83 = ks2;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_84 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_85 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_70, &var_71, &var_72, &var_73, &var_74, &var_75, &var_76, &var_77, &var_78, &var_79, &var_80, &var_81, &var_82, &var_83, &var_84, &global_scratch_85};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_convolution_native_group_norm_9, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename out_ptr0_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_convolution_10(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_convolution_10', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 262144},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_convolution_10', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_convolution_10(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 262144
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (x0), None)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tmp0.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x0), tmp1, None)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (1024 - 1)) / (1024));
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_convolution_10 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_convolution_10 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cyxtqg3wbaqtjvvsnpm76nco2oasofkesqzf2e6w6l2v3b34xu4t.cubin", "triton_poi_fused__to_copy_convolution_10", 0, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_86 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_87 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_88 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_89 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_86, &var_87, &var_88, &global_scratch_89};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_convolution_10, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename out_ptr0_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__scaled_dot_product_efficient_attention_clone_11(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ynumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__scaled_dot_product_efficient_attention_clone_11', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'y': 1024, 'x': 512}, tile_hint=TileHint.DEFAULT,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'out_ptr0': '*bf16', 'ks0': 'i32', 'ks1': 'i32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid2DWithYZOverflow', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__scaled_dot_product_efficient_attention_clone_11', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 2, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__scaled_dot_product_efficient_attention_clone_11(in_ptr0, in_ptr1, out_ptr0, ks0, ks1, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 512
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] yoffset = (tl.program_id(1) + tl.program_id(2) * tl.num_programs(1)) * YBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] yindex = yoffset + tl.arange(0, YBLOCK)[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ymask = yindex < ynumel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[None, :]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] y0 = yindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (y0 + ks0*ks1*x1), ymask & xmask, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (x1), xmask, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tmp0 + tmp1
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x1 + 512*y0), tmp2, ymask & xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t y_grid_raw_ = ((ynumel + (32 - 1)) / (32));
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t y_grid_div_ = ((y_grid_raw_ + (65535 - 1)) / (65535));
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (32 - 1)) / (32));
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = ((y_grid_raw_ + (y_grid_div_ - 1)) / (y_grid_div_));
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = y_grid_div_;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__scaled_dot_product_efficient_attention_clone_11 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__scaled_dot_product_efficient_attention_clone_11 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cnb6p6jfugm6m5kzhvoewfhdoxxnhuuq7776ulkafif652n3zfcz.cubin", "triton_poi_fused__scaled_dot_product_efficient_attention_clone_11", 2560, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_90 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_91 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_92 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_93 = ks0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_94 = ks1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_95 = ynumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_96 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_97 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_90, &var_91, &var_92, &var_93, &var_94, &var_95, &var_96, &global_scratch_97};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__scaled_dot_product_efficient_attention_clone_11, grid_0, grid_1, grid_2, 4, 2560, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_add_convolution_mul_sigmoid_12(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_out_ptr0_type_& in_out_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr3_type_& in_ptr3,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr4_type_& in_ptr4,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ynumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_add_convolution_mul_sigmoid_12', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'y': 512, 'x': 1024}, tile_hint=TileHint.DEFAULT,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'in_ptr3': '*bf16', 'in_ptr4': '*bf16', 'ks0': 'i32', 'ks1': 'i32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_add_convolution_mul_sigmoid_12', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 6, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_add_convolution_mul_sigmoid_12(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, ks0, ks1, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ynumel = 512
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] yoffset = tl.program_id(1) * YBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] yindex = yoffset + tl.arange(0, YBLOCK)[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ymask = yindex < ynumel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[None, :]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] y0 = yindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_out_ptr0 + (x1 + ks0*ks1*y0), ymask & xmask, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr0 + (y0), ymask, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tl.load(in_ptr1 + (x1 + ks0*ks1*y0), ymask & xmask, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.load(in_ptr2 + (y0), ymask, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tl.load(in_ptr3 + (y0 + 512*x1), ymask & xmask, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tl.load(in_ptr4 + (y0), ymask, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tmp0 + tmp1
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp3 + tmp4
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tmp2 + tmp5
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = tmp7 + tmp8
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10 = tmp6 + tmp9
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.debug_barrier()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(in_out_ptr0 + (x1 + ks0*ks1*y0), tmp10, ymask & xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (32 - 1)) / (32));
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = ((ynumel + (32 - 1)) / (32));
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_add_convolution_mul_sigmoid_12 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_add_convolution_mul_sigmoid_12 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cnyojbbwpuuzbl5nng3ndkyl6wdh2u2xssgt5rq6mhiboqnmpmmt.cubin", "triton_poi_fused__to_copy_add_convolution_mul_sigmoid_12", 2112, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_98 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_99 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_100 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_101 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_102 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_103 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_104 = ks0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_105 = ks1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_106 = ynumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_107 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_108 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_98, &var_99, &var_100, &var_101, &var_102, &var_103, &var_104, &var_105, &var_106, &var_107, &global_scratch_108};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_add_convolution_mul_sigmoid_12, grid_0, grid_1, grid_2, 4, 2112, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename out_ptr0_type_, typename out_ptr1_type_, typename out_ptr2_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_red_fused_native_group_norm_13(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr2_type_& out_ptr2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t r0_numel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_red_fused_native_group_norm_13', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.reduction(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 64, 'r0_': 8192},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reduction_hint=ReductionHint.INNER,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'out_ptr2': '*fp32', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_native_group_norm_13', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 3, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_red_fused_native_group_norm_13(in_ptr0, out_ptr0, out_ptr1, out_ptr2, ks0, ks1, ks2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 64
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rnumel = r0_numel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rbase = r0_base
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = (xindex % 2)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // 2
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x3 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_index = r0_offset + r0_base
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_mask = r0_index < r0_numel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] roffset = r0_offset
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rindex = r0_index
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_2 = r0_index
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (ks1*ks2*((((r0_2 + 8*ks1*ks2*x0) // ks0) % 16)) + 16*ks1*ks2*x1 + ((((r0_2 % ks0)) % ks0))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tmp0.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tl.broadcast_to(tmp1, [XBLOCK, R0_BLOCK])
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_mean_next, tmp3_m2_next, tmp3_weight_next = triton_helpers.welford_reduce(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2, tmp3_mean, tmp3_m2, tmp3_weight, roffset == 0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_mean = tl.where(r0_mask & xmask, tmp3_mean_next, tmp3_mean)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_m2 = tl.where(r0_mask & xmask, tmp3_m2_next, tmp3_m2)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_weight = tl.where(r0_mask & xmask, tmp3_weight_next, tmp3_weight)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6, tmp7, tmp8 = triton_helpers.welford(tmp3_mean, tmp3_m2, tmp3_weight, 1)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tmp6[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tmp7[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp8[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x3), tmp3, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x3), tmp4, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr2 + (x3), tmp5, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_red_fused_native_group_norm_13 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_red_fused_native_group_norm_13 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/c3wyogxxlvfs3mp5obxu24c73ptc3pl5fr5npe5sqhfjl2vuh3fy.cubin", "triton_red_fused_native_group_norm_13", 192, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_109 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_110 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_111 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_112 = reinterpret_cast<CUdeviceptr>(out_ptr2.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_113 = ks0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_114 = ks1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_115 = ks2;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_116 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_117 = r0_numel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_118 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_109, &var_110, &var_111, &var_112, &var_113, &var_114, &var_115, &var_116, &var_117, &global_scratch_118};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_red_fused_native_group_norm_13, grid_0, grid_1, grid_2, 16, 192, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename out_ptr1_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_14(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr3_type_& in_ptr3,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr4_type_& in_ptr4,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_14', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 524288},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'out_ptr1': '*bf16', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (9,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_14', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_14(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr1, ks0, ks1, ks2, xnumel, XBLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x2 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // ks0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (x2), xmask, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tl.load(in_ptr1 + (x1 // 16), xmask, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.load(in_ptr2 + (x1 // 16), xmask, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp12 = tl.load(in_ptr3 + (x1), xmask, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp14 = tl.load(in_ptr4 + (x1), xmask, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tmp0.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tmp1 - tmp2
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = 16*ks1*ks2
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tmp5.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = (tmp4 / tmp6)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = 1e-06
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = tmp7 + tmp8
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10 = libdevice.rsqrt(tmp9)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp11 = tmp3 * tmp10
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp13 = tmp11 * tmp12
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp15 = tmp13 + tmp14
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp16 = tl.sigmoid(tmp15)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp17 = tmp15 * tmp16
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp18 = tmp17.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x2), tmp18, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (512 - 1)) / (512));
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_14 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_14 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/ce66l2cfwtkucstdimfknnfvgcopcdq2vgdfqfu4f5yh4buok6mb.cubin", "triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_14", 0, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_119 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_120 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_121 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_122 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_123 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_124 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_125 = ks0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_126 = ks1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_127 = ks2;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_128 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_129 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_119, &var_120, &var_121, &var_122, &var_123, &var_124, &var_125, &var_126, &var_127, &var_128, &global_scratch_129};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_14, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename out_ptr0_type_, typename out_ptr1_type_, typename out_ptr2_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_red_fused_native_group_norm_15(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr2_type_& out_ptr2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t r0_numel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_red_fused_native_group_norm_15', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.reduction(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 64, 'r0_': 8192},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reduction_hint=ReductionHint.INNER,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'out_ptr2': '*fp32', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (9,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_native_group_norm_15', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 3, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_red_fused_native_group_norm_15(in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, ks0, ks1, ks2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 64
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rnumel = r0_numel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rbase = r0_base
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = (xindex % 2)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // 2
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x3 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_index = r0_offset + r0_base
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_mask = r0_index < r0_numel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] roffset = r0_offset
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rindex = r0_index
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_2 = r0_index
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (ks1*ks2*((((r0_2 + 8*ks1*ks2*x0) // ks0) % 16)) + 16*ks1*ks2*x1 + ((((r0_2 % ks0)) % ks0))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (ks1*ks2*((((r0_2 + 8*ks1*ks2*x0) // ks0) % 16)) + 16*ks1*ks2*x1 + ((((r0_2 % ks0)) % ks0))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tl.load(in_ptr2 + (16*x1 + ((((r0_2 + 8*ks1*ks2*x0) // ks0) % 16))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tmp1 + tmp2
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tmp0 + tmp3
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp4.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tl.broadcast_to(tmp5, [XBLOCK, R0_BLOCK])
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7_mean_next, tmp7_m2_next, tmp7_weight_next = triton_helpers.welford_reduce(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6, tmp7_mean, tmp7_m2, tmp7_weight, roffset == 0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7_mean = tl.where(r0_mask & xmask, tmp7_mean_next, tmp7_mean)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7_m2 = tl.where(r0_mask & xmask, tmp7_m2_next, tmp7_m2)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7_weight = tl.where(r0_mask & xmask, tmp7_weight_next, tmp7_weight)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10, tmp11, tmp12 = triton_helpers.welford(tmp7_mean, tmp7_m2, tmp7_weight, 1)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tmp10[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tmp11[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = tmp12[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x3), tmp7, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x3), tmp8, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr2 + (x3), tmp9, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_red_fused_native_group_norm_15 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_red_fused_native_group_norm_15 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cnqjga7ngpvi4wizwxmrd26bprns73moojxkckff3lwjafghffbb.cubin", "triton_red_fused_native_group_norm_15", 192, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_130 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_131 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_132 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_133 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_134 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_135 = reinterpret_cast<CUdeviceptr>(out_ptr2.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_136 = ks0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_137 = ks1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_138 = ks2;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_139 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_140 = r0_numel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_141 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_130, &var_131, &var_132, &var_133, &var_134, &var_135, &var_136, &var_137, &var_138, &var_139, &var_140, &global_scratch_141};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_red_fused_native_group_norm_15, grid_0, grid_1, grid_2, 16, 192, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename in_ptr5_type_, typename in_ptr6_type_, typename out_ptr1_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_16(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr3_type_& in_ptr3,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr4_type_& in_ptr4,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr5_type_& in_ptr5,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr6_type_& in_ptr6,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_16', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 524288},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'in_ptr5': '*fp32', 'in_ptr6': '*fp32', 'out_ptr1': '*bf16', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (11,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_16', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 7, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_16(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr1, ks0, ks1, ks2, xnumel, XBLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x2 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // ks0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (x2), xmask, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (x2), xmask, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tl.load(in_ptr2 + (x1), xmask, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tl.load(in_ptr3 + (x1 // 16), xmask, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tl.load(in_ptr4 + (x1 // 16), xmask, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp16 = tl.load(in_ptr5 + (x1), xmask, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp18 = tl.load(in_ptr6 + (x1), xmask, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tmp1 + tmp2
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tmp0 + tmp3
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp4.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tmp5 - tmp6
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = 16*ks1*ks2
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10 = tmp9.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp11 = (tmp8 / tmp10)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp12 = 1e-06
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp13 = tmp11 + tmp12
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp14 = libdevice.rsqrt(tmp13)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp15 = tmp7 * tmp14
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp17 = tmp15 * tmp16
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp19 = tmp17 + tmp18
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp20 = tl.sigmoid(tmp19)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp21 = tmp19 * tmp20
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp22 = tmp21.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x2), tmp22, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (512 - 1)) / (512));
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_16 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_16 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cerzvyddgk6xjjm6rnz7xwp3oibvn5tgw6pbshclafd2olobrl6i.cubin", "triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_16", 0, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_142 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_143 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_144 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_145 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_146 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_147 = reinterpret_cast<CUdeviceptr>(in_ptr5.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_148 = reinterpret_cast<CUdeviceptr>(in_ptr6.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_149 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_150 = ks0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_151 = ks1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_152 = ks2;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_153 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_154 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_142, &var_143, &var_144, &var_145, &var_146, &var_147, &var_148, &var_149, &var_150, &var_151, &var_152, &var_153, &global_scratch_154};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_16, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename out_ptr0_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_add_convolution_mul_sigmoid_17(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr3_type_& in_ptr3,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr4_type_& in_ptr4,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_add_convolution_mul_sigmoid_17', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 524288},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'in_ptr3': '*bf16', 'in_ptr4': '*bf16', 'out_ptr0': '*fp32', 'ks0': 'i32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_add_convolution_mul_sigmoid_17', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_add_convolution_mul_sigmoid_17(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, ks0, xnumel, XBLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x2 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // ks0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (x2), xmask, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (x2), xmask, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tl.load(in_ptr2 + (x1), xmask, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tl.load(in_ptr3 + (x2), xmask, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tl.load(in_ptr4 + (x1), xmask, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tmp1 + tmp2
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tmp0 + tmp3
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp4.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tmp6 + tmp7
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = tmp8.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10 = tmp5 + tmp9
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x2), tmp10, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (1024 - 1)) / (1024));
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_add_convolution_mul_sigmoid_17 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_add_convolution_mul_sigmoid_17 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cc5fomvsznhjfks2obdqhntif5ulal7zx3dlnqxgptc55tazlzp6.cubin", "triton_poi_fused__to_copy_add_convolution_mul_sigmoid_17", 4096, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_155 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_156 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_157 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_158 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_159 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_160 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_161 = ks0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_162 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_163 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_155, &var_156, &var_157, &var_158, &var_159, &var_160, &var_161, &var_162, &global_scratch_163};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_add_convolution_mul_sigmoid_17, grid_0, grid_1, grid_2, 4, 4096, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename out_ptr0_type_, typename out_ptr1_type_, typename out_ptr2_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_red_fused_native_group_norm_18(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr2_type_& out_ptr2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t r0_numel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_red_fused_native_group_norm_18', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.reduction(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 64, 'r0_': 8192},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reduction_hint=ReductionHint.INNER,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'out_ptr2': '*fp32', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_native_group_norm_18', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 3, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_red_fused_native_group_norm_18(in_ptr0, out_ptr0, out_ptr1, out_ptr2, ks0, ks1, ks2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 64
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rnumel = r0_numel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rbase = r0_base
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = (xindex % 2)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // 2
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x3 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_index = r0_offset + r0_base
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_mask = r0_index < r0_numel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] roffset = r0_offset
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rindex = r0_index
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_2 = r0_index
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (ks1*ks2*((((r0_2 + 8*ks1*ks2*x0) // ks0) % 16)) + 16*ks1*ks2*x1 + ((r0_2 % ks0))), xmask & r0_mask, eviction_policy='evict_last', other=0.0)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.broadcast_to(tmp0, [XBLOCK, R0_BLOCK])
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2_mean_next, tmp2_m2_next, tmp2_weight_next = triton_helpers.welford_reduce(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1, tmp2_mean, tmp2_m2, tmp2_weight, roffset == 0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2_mean = tl.where(r0_mask & xmask, tmp2_mean_next, tmp2_mean)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2_m2 = tl.where(r0_mask & xmask, tmp2_m2_next, tmp2_m2)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2_weight = tl.where(r0_mask & xmask, tmp2_weight_next, tmp2_weight)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5, tmp6, tmp7 = triton_helpers.welford(tmp2_mean, tmp2_m2, tmp2_weight, 1)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tmp5[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tmp6[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tmp7[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x3), tmp2, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x3), tmp3, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr2 + (x3), tmp4, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_red_fused_native_group_norm_18 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_red_fused_native_group_norm_18 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/c672ewfhywvvjvvqmyycaeh4n67pdgbznw5zobkheaov2niu4irf.cubin", "triton_red_fused_native_group_norm_18", 192, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_164 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_165 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_166 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_167 = reinterpret_cast<CUdeviceptr>(out_ptr2.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_168 = ks0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_169 = ks1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_170 = ks2;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_171 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_172 = r0_numel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_173 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_164, &var_165, &var_166, &var_167, &var_168, &var_169, &var_170, &var_171, &var_172, &global_scratch_173};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_red_fused_native_group_norm_18, grid_0, grid_1, grid_2, 16, 192, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename out_ptr1_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_19(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr3_type_& in_ptr3,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr4_type_& in_ptr4,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_19', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 524288},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'out_ptr1': '*bf16', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (9,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_19', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_19(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr1, ks0, ks1, ks2, xnumel, XBLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x2 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // ks0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (x2), xmask, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (x1 // 16), xmask, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tl.load(in_ptr2 + (x1 // 16), xmask, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp11 = tl.load(in_ptr3 + (x1), xmask, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp13 = tl.load(in_ptr4 + (x1), xmask, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tmp0 - tmp1
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = 16*ks1*ks2
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp4.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = (tmp3 / tmp5)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = 1e-06
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tmp6 + tmp7
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = libdevice.rsqrt(tmp8)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10 = tmp2 * tmp9
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp12 = tmp10 * tmp11
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp14 = tmp12 + tmp13
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp15 = tl.sigmoid(tmp14)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp16 = tmp14 * tmp15
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp17 = tmp16.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x2), tmp17, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (512 - 1)) / (512));
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_19 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_19 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cshbdjfifm35yvmn4qp3vxv2imemcdoeytxtqrtfwba6wavhq6ze.cubin", "triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_19", 0, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_174 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_175 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_176 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_177 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_178 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_179 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_180 = ks0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_181 = ks1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_182 = ks2;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_183 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_184 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_174, &var_175, &var_176, &var_177, &var_178, &var_179, &var_180, &var_181, &var_182, &var_183, &global_scratch_184};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_19, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename out_ptr0_type_, typename out_ptr1_type_, typename out_ptr2_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_red_fused_native_group_norm_20(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr2_type_& out_ptr2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t r0_numel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_red_fused_native_group_norm_20', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.reduction(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 64, 'r0_': 8192},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reduction_hint=ReductionHint.INNER,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'out_ptr2': '*fp32', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (9,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_native_group_norm_20', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 3, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_red_fused_native_group_norm_20(in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, ks0, ks1, ks2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 64
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rnumel = r0_numel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rbase = r0_base
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = (xindex % 2)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // 2
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x3 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_index = r0_offset + r0_base
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_mask = r0_index < r0_numel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] roffset = r0_offset
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rindex = r0_index
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_2 = r0_index
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (ks1*ks2*((((r0_2 + 8*ks1*ks2*x0) // ks0) % 16)) + 16*ks1*ks2*x1 + ((((r0_2 % ks0)) % ks0))), xmask & r0_mask, eviction_policy='evict_last', other=0.0)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (ks1*ks2*((((r0_2 + 8*ks1*ks2*x0) // ks0) % 16)) + 16*ks1*ks2*x1 + ((((r0_2 % ks0)) % ks0))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tl.load(in_ptr2 + (16*x1 + ((((r0_2 + 8*ks1*ks2*x0) // ks0) % 16))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tmp1 + tmp2
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tmp3.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp0 + tmp4
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tl.broadcast_to(tmp5, [XBLOCK, R0_BLOCK])
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7_mean_next, tmp7_m2_next, tmp7_weight_next = triton_helpers.welford_reduce(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6, tmp7_mean, tmp7_m2, tmp7_weight, roffset == 0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7_mean = tl.where(r0_mask & xmask, tmp7_mean_next, tmp7_mean)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7_m2 = tl.where(r0_mask & xmask, tmp7_m2_next, tmp7_m2)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7_weight = tl.where(r0_mask & xmask, tmp7_weight_next, tmp7_weight)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10, tmp11, tmp12 = triton_helpers.welford(tmp7_mean, tmp7_m2, tmp7_weight, 1)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tmp10[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tmp11[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = tmp12[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x3), tmp7, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x3), tmp8, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr2 + (x3), tmp9, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_red_fused_native_group_norm_20 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_red_fused_native_group_norm_20 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/crfju4ylwfirzczakk4i75xxohp6oblrrdokawkmezmkq6dldhnn.cubin", "triton_red_fused_native_group_norm_20", 192, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_185 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_186 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_187 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_188 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_189 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_190 = reinterpret_cast<CUdeviceptr>(out_ptr2.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_191 = ks0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_192 = ks1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_193 = ks2;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_194 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_195 = r0_numel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_196 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_185, &var_186, &var_187, &var_188, &var_189, &var_190, &var_191, &var_192, &var_193, &var_194, &var_195, &global_scratch_196};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_red_fused_native_group_norm_20, grid_0, grid_1, grid_2, 16, 192, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename in_ptr5_type_, typename in_ptr6_type_, typename out_ptr1_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_21(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr3_type_& in_ptr3,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr4_type_& in_ptr4,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr5_type_& in_ptr5,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr6_type_& in_ptr6,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_21', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 524288},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'in_ptr5': '*fp32', 'in_ptr6': '*fp32', 'out_ptr1': '*bf16', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (11,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_21', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 7, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_21(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr1, ks0, ks1, ks2, xnumel, XBLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x2 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // ks0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (x2), xmask, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (x2), xmask, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tl.load(in_ptr2 + (x1), xmask, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tl.load(in_ptr3 + (x1 // 16), xmask, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tl.load(in_ptr4 + (x1 // 16), xmask, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp16 = tl.load(in_ptr5 + (x1), xmask, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp18 = tl.load(in_ptr6 + (x1), xmask, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tmp1 + tmp2
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tmp3.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp0 + tmp4
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tmp5 - tmp6
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = 16*ks1*ks2
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10 = tmp9.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp11 = (tmp8 / tmp10)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp12 = 1e-06
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp13 = tmp11 + tmp12
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp14 = libdevice.rsqrt(tmp13)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp15 = tmp7 * tmp14
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp17 = tmp15 * tmp16
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp19 = tmp17 + tmp18
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp20 = tl.sigmoid(tmp19)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp21 = tmp19 * tmp20
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp22 = tmp21.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x2), tmp22, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (512 - 1)) / (512));
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_21 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_21 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cxqg5i22geotiyh56e6jwtjx7fxpr3dgsemhzymjf63gxuiwjyjz.cubin", "triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_21", 0, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_197 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_198 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_199 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_200 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_201 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_202 = reinterpret_cast<CUdeviceptr>(in_ptr5.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_203 = reinterpret_cast<CUdeviceptr>(in_ptr6.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_204 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_205 = ks0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_206 = ks1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_207 = ks2;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_208 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_209 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_197, &var_198, &var_199, &var_200, &var_201, &var_202, &var_203, &var_204, &var_205, &var_206, &var_207, &var_208, &global_scratch_209};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_21, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename out_ptr0_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_22(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr3_type_& in_ptr3,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr4_type_& in_ptr4,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks3,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks4,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_22', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 2097152},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'in_ptr3': '*bf16', 'in_ptr4': '*bf16', 'out_ptr0': '*bf16', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'ks3': 'i32', 'ks4': 'i32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (11,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_22', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 2, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_22(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, ks0, ks1, ks2, ks3, ks4, xnumel, XBLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = ((xindex // ks1) % ks2)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = (xindex % ks1)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x2 = xindex // ks4
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x3 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp22 = tl.load(in_ptr2 + (x2), xmask, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp27 = tl.load(in_ptr4 + (x2), xmask, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = ((ks0.to(tl.float64)) / (tl.full([], 2.00000000000000, tl.float64)*ks0.to(tl.float64)))
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tmp0.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = x1
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tmp2.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tmp3 * tmp1
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp4.to(tl.int64)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = ks0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tmp5 + tmp6
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tmp5 < 0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = tl.where(tmp8, tmp7, tmp5)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10 = ((ks3.to(tl.float64)) / (tl.full([], 2.00000000000000, tl.float64)*ks3.to(tl.float64)))
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp11 = tmp10.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp12 = x0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp13 = tmp12.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp14 = tmp13 * tmp11
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp15 = tmp14.to(tl.int64)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp16 = ks3
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp17 = tmp15 + tmp16
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp18 = tmp15 < 0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp19 = tl.where(tmp18, tmp17, tmp15)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp20 = tl.load(in_ptr0 + (tmp19 + ks3*tmp9 + ks0*ks3*x2), xmask, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp21 = tl.load(in_ptr1 + (tmp19 + ks3*tmp9 + ks0*ks3*x2), xmask, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp23 = tmp21 + tmp22
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp24 = tmp23.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp25 = tmp20 + tmp24
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp26 = tl.load(in_ptr3 + (tmp19 + ks3*tmp9 + ks0*ks3*x2), xmask, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp28 = tmp26 + tmp27
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp29 = tmp28.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp30 = tmp25 + tmp29
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp31 = tmp30.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x3), tmp31, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (1024 - 1)) / (1024));
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_22 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_22 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/c74i6lod2nm4yqm3gxjie5sdqhab5gsy4nw7acrdg2qn7c7toe4o.cubin", "triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_22", 0, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_210 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_211 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_212 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_213 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_214 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_215 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_216 = ks0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_217 = ks1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_218 = ks2;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_219 = ks3;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_220 = ks4;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_221 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_222 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_210, &var_211, &var_212, &var_213, &var_214, &var_215, &var_216, &var_217, &var_218, &var_219, &var_220, &var_221, &global_scratch_222};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_22, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename out_ptr0_type_, typename out_ptr1_type_, typename out_ptr2_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_red_fused_native_group_norm_23(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr2_type_& out_ptr2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t r0_numel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_red_fused_native_group_norm_23', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.reduction(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 256, 'r0_': 8192},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reduction_hint=ReductionHint.INNER,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'out_ptr2': '*fp32', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_native_group_norm_23', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 2, 'num_reduction': 3, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_red_fused_native_group_norm_23(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, ks0, ks1, ks2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 256
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rnumel = r0_numel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rbase = r0_base
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = (xindex % 8)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // 8
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x3 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_index = r0_offset + r0_base
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_mask = r0_index < r0_numel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] roffset = r0_offset
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rindex = r0_index
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_2 = r0_index
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (4*ks1*ks2*((((r0_2 + 8*ks1*ks2*x0) // ks0) % 16)) + 64*ks1*ks2*x1 + ((((r0_2 % ks0)) % ks0))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (16*x1 + ((((r0_2 + 8*ks1*ks2*x0) // ks0) % 16))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tmp0 + tmp1
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tmp2.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.broadcast_to(tmp3, [XBLOCK, R0_BLOCK])
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_mean_next, tmp5_m2_next, tmp5_weight_next = triton_helpers.welford_reduce(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4, tmp5_mean, tmp5_m2, tmp5_weight, roffset == 0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_mean = tl.where(r0_mask & xmask, tmp5_mean_next, tmp5_mean)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_m2 = tl.where(r0_mask & xmask, tmp5_m2_next, tmp5_m2)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_weight = tl.where(r0_mask & xmask, tmp5_weight_next, tmp5_weight)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8, tmp9, tmp10 = triton_helpers.welford(tmp5_mean, tmp5_m2, tmp5_weight, 1)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp8[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tmp9[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tmp10[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x3), tmp5, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x3), tmp6, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr2 + (x3), tmp7, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_red_fused_native_group_norm_23 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_red_fused_native_group_norm_23 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/ctwlo73csmgu7xibbxfmsczvr3pg3v7iekhys6dl4ijgd6z3rkgw.cubin", "triton_red_fused_native_group_norm_23", 192, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_223 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_224 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_225 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_226 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_227 = reinterpret_cast<CUdeviceptr>(out_ptr2.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_228 = ks0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_229 = ks1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_230 = ks2;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_231 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_232 = r0_numel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_233 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_223, &var_224, &var_225, &var_226, &var_227, &var_228, &var_229, &var_230, &var_231, &var_232, &global_scratch_233};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_red_fused_native_group_norm_23, grid_0, grid_1, grid_2, 16, 192, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename out_ptr0_type_, typename out_ptr1_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_per_fused_native_group_norm_24(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t r0_numel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_per_fused_native_group_norm_24', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.persistent_reduction(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 32, 'r0_': 8},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reduction_hint=ReductionHint.INNER,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_per_fused_native_group_norm_24', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 2, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_per_fused_native_group_norm_24(in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, xnumel, r0_numel, XBLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 32
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_numel = 8
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] R0_BLOCK: tl.constexpr = 8
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rnumel = r0_numel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_index = tl.arange(0, R0_BLOCK)[None, :]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_offset = 0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_mask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] roffset = r0_offset
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rindex = r0_index
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_1 = r0_index
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (r0_1 + 8*x0), xmask, other=0.0)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (r0_1 + 8*x0), xmask, other=0.0)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tl.load(in_ptr2 + (r0_1 + 8*x0), xmask, other=0.0)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tl.broadcast_to(tmp0, [XBLOCK, R0_BLOCK])
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.broadcast_to(tmp1, [XBLOCK, R0_BLOCK])
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tl.broadcast_to(tmp2, [XBLOCK, R0_BLOCK])
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tl.where(xmask, tmp3, 0)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tl.where(xmask, tmp4, 0)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = tl.where(xmask, tmp5, 0)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10, tmp11, tmp12 = triton_helpers.welford(tmp7, tmp8, tmp9, 1)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp13 = tmp10[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp14 = tmp11[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp15 = tmp12[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x0), tmp13, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x0), tmp14, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (8 - 1)) / (8));
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_per_fused_native_group_norm_24 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_per_fused_native_group_norm_24 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cabjefmhsq4nvbq7xbsodpebbztvkrtwuhhrskefvh3uinxa2awr.cubin", "triton_per_fused_native_group_norm_24", 32, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_234 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_235 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_236 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_237 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_238 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_239 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_240 = r0_numel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_241 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_234, &var_235, &var_236, &var_237, &var_238, &var_239, &var_240, &global_scratch_241};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_per_fused_native_group_norm_24, grid_0, grid_1, grid_2, 2, 32, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename in_ptr5_type_, typename out_ptr1_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_25(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr3_type_& in_ptr3,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr4_type_& in_ptr4,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr5_type_& in_ptr5,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_25', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 2097152},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'in_ptr5': '*fp32', 'out_ptr1': '*bf16', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (10,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_25', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 6, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_25(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr1, ks0, ks1, ks2, xnumel, XBLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x2 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // ks0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (x2), xmask, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (x1), xmask, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.load(in_ptr2 + (x1 // 16), xmask, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tl.load(in_ptr3 + (x1 // 16), xmask, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp14 = tl.load(in_ptr4 + (x1), xmask, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp16 = tl.load(in_ptr5 + (x1), xmask, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tmp0 + tmp1
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tmp2.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp3 - tmp4
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = 64*ks1*ks2
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tmp7.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = (tmp6 / tmp8)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10 = 1e-06
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp11 = tmp9 + tmp10
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp12 = libdevice.rsqrt(tmp11)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp13 = tmp5 * tmp12
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp15 = tmp13 * tmp14
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp17 = tmp15 + tmp16
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp18 = tl.sigmoid(tmp17)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp19 = tmp17 * tmp18
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp20 = tmp19.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x2), tmp20, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (1024 - 1)) / (1024));
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_25 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_25 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/ces3hf5d632ipoxxctusejp5vs7esgaw5i3i6rq3r5mb3zvgooya.cubin", "triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_25", 0, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_242 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_243 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_244 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_245 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_246 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_247 = reinterpret_cast<CUdeviceptr>(in_ptr5.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_248 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_249 = ks0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_250 = ks1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_251 = ks2;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_252 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_253 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_242, &var_243, &var_244, &var_245, &var_246, &var_247, &var_248, &var_249, &var_250, &var_251, &var_252, &global_scratch_253};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_25, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename out_ptr0_type_, typename out_ptr1_type_, typename out_ptr2_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_red_fused_native_group_norm_26(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr3_type_& in_ptr3,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr2_type_& out_ptr2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t r0_numel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_red_fused_native_group_norm_26', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.reduction(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 256, 'r0_': 8192},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reduction_hint=ReductionHint.INNER,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'in_ptr3': '*bf16', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'out_ptr2': '*fp32', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (10,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_native_group_norm_26', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 4, 'num_reduction': 3, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_red_fused_native_group_norm_26(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, out_ptr1, out_ptr2, ks0, ks1, ks2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 256
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rnumel = r0_numel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rbase = r0_base
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = (xindex % 8)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // 8
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x3 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_index = r0_offset + r0_base
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_mask = r0_index < r0_numel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] roffset = r0_offset
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rindex = r0_index
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_2 = r0_index
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (4*ks1*ks2*((((r0_2 + 8*ks1*ks2*x0) // ks0) % 16)) + 64*ks1*ks2*x1 + ((((r0_2 % ks0)) % ks0))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (16*x1 + ((((r0_2 + 8*ks1*ks2*x0) // ks0) % 16))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tl.load(in_ptr2 + (4*ks1*ks2*((((r0_2 + 8*ks1*ks2*x0) // ks0) % 16)) + 64*ks1*ks2*x1 + ((((r0_2 % ks0)) % ks0))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.load(in_ptr3 + (16*x1 + ((((r0_2 + 8*ks1*ks2*x0) // ks0) % 16))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tmp0 + tmp1
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp3 + tmp4
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tmp2 + tmp5
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tmp6.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tl.broadcast_to(tmp7, [XBLOCK, R0_BLOCK])
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9_mean_next, tmp9_m2_next, tmp9_weight_next = triton_helpers.welford_reduce(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8, tmp9_mean, tmp9_m2, tmp9_weight, roffset == 0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9_mean = tl.where(r0_mask & xmask, tmp9_mean_next, tmp9_mean)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9_m2 = tl.where(r0_mask & xmask, tmp9_m2_next, tmp9_m2)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9_weight = tl.where(r0_mask & xmask, tmp9_weight_next, tmp9_weight)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp12, tmp13, tmp14 = triton_helpers.welford(tmp9_mean, tmp9_m2, tmp9_weight, 1)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = tmp12[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10 = tmp13[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp11 = tmp14[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x3), tmp9, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x3), tmp10, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr2 + (x3), tmp11, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_red_fused_native_group_norm_26 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_red_fused_native_group_norm_26 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/csusruwdq3ewpec2mijb54kievycqaj2fywnsptmdxjebzj7wfg5.cubin", "triton_red_fused_native_group_norm_26", 192, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_254 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_255 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_256 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_257 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_258 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_259 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_260 = reinterpret_cast<CUdeviceptr>(out_ptr2.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_261 = ks0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_262 = ks1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_263 = ks2;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_264 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_265 = r0_numel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_266 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_254, &var_255, &var_256, &var_257, &var_258, &var_259, &var_260, &var_261, &var_262, &var_263, &var_264, &var_265, &global_scratch_266};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_red_fused_native_group_norm_26, grid_0, grid_1, grid_2, 16, 192, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename in_ptr5_type_, typename in_ptr6_type_, typename in_ptr7_type_, typename out_ptr1_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_27(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr3_type_& in_ptr3,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr4_type_& in_ptr4,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr5_type_& in_ptr5,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr6_type_& in_ptr6,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr7_type_& in_ptr7,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_27', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 2097152},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'in_ptr3': '*bf16', 'in_ptr4': '*fp32', 'in_ptr5': '*fp32', 'in_ptr6': '*fp32', 'in_ptr7': '*fp32', 'out_ptr1': '*bf16', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]], (12,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_27', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 8, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_27(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, out_ptr1, ks0, ks1, ks2, xnumel, XBLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x2 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // ks0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (x2), xmask, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (x1), xmask, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tl.load(in_ptr2 + (x2), xmask, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.load(in_ptr3 + (x1), xmask, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tl.load(in_ptr4 + (x1 // 16), xmask, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10 = tl.load(in_ptr5 + (x1 // 16), xmask, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp18 = tl.load(in_ptr6 + (x1), xmask, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp20 = tl.load(in_ptr7 + (x1), xmask, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tmp0 + tmp1
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp3 + tmp4
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tmp2 + tmp5
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tmp6.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = tmp7 - tmp8
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp11 = 64*ks1*ks2
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp12 = tmp11.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp13 = (tmp10 / tmp12)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp14 = 1e-06
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp15 = tmp13 + tmp14
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp16 = libdevice.rsqrt(tmp15)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp17 = tmp9 * tmp16
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp19 = tmp17 * tmp18
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp21 = tmp19 + tmp20
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp22 = tl.sigmoid(tmp21)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp23 = tmp21 * tmp22
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp24 = tmp23.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x2), tmp24, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (1024 - 1)) / (1024));
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_27 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_27 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cerpylu2mm64w35oojz7f34d725iijez6ymut46gsocdftasf4sn.cubin", "triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_27", 0, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_267 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_268 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_269 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_270 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_271 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_272 = reinterpret_cast<CUdeviceptr>(in_ptr5.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_273 = reinterpret_cast<CUdeviceptr>(in_ptr6.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_274 = reinterpret_cast<CUdeviceptr>(in_ptr7.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_275 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_276 = ks0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_277 = ks1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_278 = ks2;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_279 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_280 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_267, &var_268, &var_269, &var_270, &var_271, &var_272, &var_273, &var_274, &var_275, &var_276, &var_277, &var_278, &var_279, &global_scratch_280};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_27, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_add_convolution_mul_sigmoid_28(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_out_ptr0_type_& in_out_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr3_type_& in_ptr3,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr4_type_& in_ptr4,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_add_convolution_mul_sigmoid_28', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 2097152},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'in_ptr3': '*bf16', 'in_ptr4': '*bf16', 'ks0': 'i32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_add_convolution_mul_sigmoid_28', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 6, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_add_convolution_mul_sigmoid_28(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, ks0, xnumel, XBLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x2 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // ks0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_out_ptr0 + (x2), xmask, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr0 + (x1), xmask, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tl.load(in_ptr1 + (x2), xmask, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.load(in_ptr2 + (x1), xmask, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tl.load(in_ptr3 + (x2), xmask, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tl.load(in_ptr4 + (x1), xmask, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tmp0 + tmp1
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp3 + tmp4
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tmp2 + tmp5
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = tmp7 + tmp8
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10 = tmp6 + tmp9
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(in_out_ptr0 + (x2), tmp10, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (512 - 1)) / (512));
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_add_convolution_mul_sigmoid_28 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_add_convolution_mul_sigmoid_28 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cna7x4qz26ppazei4h75gxjil3phwhoamgemi6f3q6hyl2pwqoyu.cubin", "triton_poi_fused__to_copy_add_convolution_mul_sigmoid_28", 0, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_281 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_282 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_283 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_284 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_285 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_286 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_287 = ks0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_288 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_289 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_281, &var_282, &var_283, &var_284, &var_285, &var_286, &var_287, &var_288, &global_scratch_289};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_add_convolution_mul_sigmoid_28, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename out_ptr0_type_, typename out_ptr1_type_, typename out_ptr2_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_red_fused_native_group_norm_29(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr2_type_& out_ptr2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t r0_numel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_red_fused_native_group_norm_29', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.reduction(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 256, 'r0_': 8192},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reduction_hint=ReductionHint.INNER,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'out_ptr2': '*fp32', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_native_group_norm_29', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 3, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_red_fused_native_group_norm_29(in_ptr0, out_ptr0, out_ptr1, out_ptr2, ks0, ks1, ks2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 256
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rnumel = r0_numel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rbase = r0_base
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = (xindex % 8)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // 8
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x3 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_index = r0_offset + r0_base
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_mask = r0_index < r0_numel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] roffset = r0_offset
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rindex = r0_index
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_2 = r0_index
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (4*ks1*ks2*((((r0_2 + 8*ks1*ks2*x0) // ks0) % 16)) + 64*ks1*ks2*x1 + ((((r0_2 % ks0)) % ks0))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tmp0.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tl.broadcast_to(tmp1, [XBLOCK, R0_BLOCK])
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_mean_next, tmp3_m2_next, tmp3_weight_next = triton_helpers.welford_reduce(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2, tmp3_mean, tmp3_m2, tmp3_weight, roffset == 0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_mean = tl.where(r0_mask & xmask, tmp3_mean_next, tmp3_mean)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_m2 = tl.where(r0_mask & xmask, tmp3_m2_next, tmp3_m2)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_weight = tl.where(r0_mask & xmask, tmp3_weight_next, tmp3_weight)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6, tmp7, tmp8 = triton_helpers.welford(tmp3_mean, tmp3_m2, tmp3_weight, 1)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tmp6[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tmp7[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp8[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x3), tmp3, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x3), tmp4, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr2 + (x3), tmp5, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_red_fused_native_group_norm_29 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_red_fused_native_group_norm_29 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cagmz5pdlam2a7g56gbnkt5qx7bz5w6ehdm5536i3o4henvtc4nc.cubin", "triton_red_fused_native_group_norm_29", 192, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_290 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_291 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_292 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_293 = reinterpret_cast<CUdeviceptr>(out_ptr2.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_294 = ks0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_295 = ks1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_296 = ks2;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_297 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_298 = r0_numel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_299 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_290, &var_291, &var_292, &var_293, &var_294, &var_295, &var_296, &var_297, &var_298, &global_scratch_299};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_red_fused_native_group_norm_29, grid_0, grid_1, grid_2, 16, 192, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename out_ptr1_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_30(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr3_type_& in_ptr3,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr4_type_& in_ptr4,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_30', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 2097152},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'out_ptr1': '*bf16', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (9,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_30', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_30(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr1, ks0, ks1, ks2, xnumel, XBLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x2 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // ks0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (x2), xmask, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tl.load(in_ptr1 + (x1 // 16), xmask, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.load(in_ptr2 + (x1 // 16), xmask, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp12 = tl.load(in_ptr3 + (x1), xmask, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp14 = tl.load(in_ptr4 + (x1), xmask, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tmp0.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tmp1 - tmp2
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = 64*ks1*ks2
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tmp5.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = (tmp4 / tmp6)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = 1e-06
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = tmp7 + tmp8
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10 = libdevice.rsqrt(tmp9)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp11 = tmp3 * tmp10
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp13 = tmp11 * tmp12
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp15 = tmp13 + tmp14
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp16 = tl.sigmoid(tmp15)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp17 = tmp15 * tmp16
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp18 = tmp17.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x2), tmp18, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (1024 - 1)) / (1024));
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_30 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_30 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cyr6hjuydoo47hk56ee2zmceyv5eaguqbpn4xnxhkvswp6kxusnv.cubin", "triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_30", 0, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_300 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_301 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_302 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_303 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_304 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_305 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_306 = ks0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_307 = ks1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_308 = ks2;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_309 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_310 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_300, &var_301, &var_302, &var_303, &var_304, &var_305, &var_306, &var_307, &var_308, &var_309, &global_scratch_310};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_30, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename out_ptr0_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_31(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks3,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks4,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks5,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks6,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_31', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 8388608},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'out_ptr0': '*bf16', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'ks3': 'i32', 'ks4': 'i32', 'ks5': 'i32', 'ks6': 'i32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]], (11,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_31', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_31(in_ptr0, in_ptr1, in_ptr2, out_ptr0, ks0, ks1, ks2, ks3, ks4, ks5, ks6, xnumel, XBLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = ((xindex // ks1) % ks2)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = (xindex % ks1)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x2 = xindex // ks4
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x3 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp22 = tl.load(in_ptr2 + (x2), None, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = ((ks0.to(tl.float64)) / (tl.full([], 2.00000000000000, tl.float64)*ks0.to(tl.float64)))
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tmp0.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = x1
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tmp2.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tmp3 * tmp1
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp4.to(tl.int64)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = ks0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tmp5 + tmp6
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tmp5 < 0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = tl.where(tmp8, tmp7, tmp5)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10 = ((ks3.to(tl.float64)) / (tl.full([], 2.00000000000000, tl.float64)*ks3.to(tl.float64)))
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp11 = tmp10.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp12 = x0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp13 = tmp12.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp14 = tmp13 * tmp11
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp15 = tmp14.to(tl.int64)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp16 = ks3
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp17 = tmp15 + tmp16
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp18 = tmp15 < 0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp19 = tl.where(tmp18, tmp17, tmp15)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp20 = tl.load(in_ptr0 + (tmp19 + 2*ks5*tmp9 + 4*ks5*ks6*x2), None, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp21 = tl.load(in_ptr1 + (tmp19 + 2*ks5*tmp9 + 4*ks5*ks6*x2), None, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp23 = tmp21 + tmp22
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp24 = tmp20 + tmp23
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp25 = tmp24.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp26 = tmp25.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x3), tmp26, None)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (1024 - 1)) / (1024));
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_31 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_31 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/ciatxcv44lyh4todkaep4o6twfjxyjyn7eo5vk3cd64g3zsq7r5g.cubin", "triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_31", 0, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_311 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_312 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_313 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_314 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_315 = ks0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_316 = ks1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_317 = ks2;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_318 = ks3;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_319 = ks4;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_320 = ks5;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_321 = ks6;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_322 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_323 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_311, &var_312, &var_313, &var_314, &var_315, &var_316, &var_317, &var_318, &var_319, &var_320, &var_321, &var_322, &global_scratch_323};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_31, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_32(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_out_ptr0_type_& in_out_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_32', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 8388608},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*bf16', 'ks0': 'i32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_32', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 2, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_32(in_out_ptr0, in_ptr0, ks0, xnumel, XBLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x2 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // ks0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_out_ptr0 + (x2), None, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr0 + (x1), None, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tmp0 + tmp1
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(in_out_ptr0 + (x2), tmp2, None)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (1024 - 1)) / (1024));
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_32 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_32 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/c72tmv5oguk4xfujev6a5fibkw3pttym5ffhzintzl3ribyzm45t.cubin", "triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_32", 0, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_324 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_325 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_326 = ks0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_327 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_328 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_324, &var_325, &var_326, &var_327, &global_scratch_328};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_32, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename out_ptr0_type_, typename out_ptr1_type_, typename out_ptr2_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_red_fused_native_group_norm_33(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr2_type_& out_ptr2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t r0_numel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_red_fused_native_group_norm_33', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.reduction(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 1024, 'r0_': 8192},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reduction_hint=ReductionHint.INNER,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'out_ptr2': '*fp32', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_native_group_norm_33', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 3, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_red_fused_native_group_norm_33(in_ptr0, out_ptr0, out_ptr1, out_ptr2, ks0, ks1, ks2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 1024
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rnumel = r0_numel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rbase = r0_base
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = (xindex % 32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // 32
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x3 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_index = r0_offset + r0_base
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_mask = r0_index < r0_numel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] roffset = r0_offset
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rindex = r0_index
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_2 = r0_index
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (16*ks1*ks2*((((r0_2 + 8*ks1*ks2*x0) // ks0) % 16)) + 256*ks1*ks2*x1 + (((((r0_2 + 8*ks1*ks2*x0) % ks0)) % ks0))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tmp0.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tl.broadcast_to(tmp1, [XBLOCK, R0_BLOCK])
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_mean_next, tmp3_m2_next, tmp3_weight_next = triton_helpers.welford_reduce(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2, tmp3_mean, tmp3_m2, tmp3_weight, roffset == 0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_mean = tl.where(r0_mask & xmask, tmp3_mean_next, tmp3_mean)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_m2 = tl.where(r0_mask & xmask, tmp3_m2_next, tmp3_m2)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_weight = tl.where(r0_mask & xmask, tmp3_weight_next, tmp3_weight)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6, tmp7, tmp8 = triton_helpers.welford(tmp3_mean, tmp3_m2, tmp3_weight, 1)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tmp6[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tmp7[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp8[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x3), tmp3, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x3), tmp4, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr2 + (x3), tmp5, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_red_fused_native_group_norm_33 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_red_fused_native_group_norm_33 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/clzfhzbtnqqa2q6zxs6izbsa4jom7ty3fckozqmq7ut4kk4n2hez.cubin", "triton_red_fused_native_group_norm_33", 192, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_329 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_330 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_331 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_332 = reinterpret_cast<CUdeviceptr>(out_ptr2.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_333 = ks0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_334 = ks1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_335 = ks2;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_336 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_337 = r0_numel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_338 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_329, &var_330, &var_331, &var_332, &var_333, &var_334, &var_335, &var_336, &var_337, &global_scratch_338};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_red_fused_native_group_norm_33, grid_0, grid_1, grid_2, 16, 192, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename out_ptr0_type_, typename out_ptr1_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_per_fused_native_group_norm_34(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t r0_numel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_per_fused_native_group_norm_34', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.persistent_reduction(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 32, 'r0_': 32},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reduction_hint=ReductionHint.INNER,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_per_fused_native_group_norm_34', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 2, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_per_fused_native_group_norm_34(in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, xnumel, r0_numel, XBLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 32
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_numel = 32
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] R0_BLOCK: tl.constexpr = 32
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rnumel = r0_numel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_index = tl.arange(0, R0_BLOCK)[None, :]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_offset = 0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_mask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] roffset = r0_offset
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rindex = r0_index
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_1 = r0_index
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (r0_1 + 32*x0), xmask, other=0.0)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (r0_1 + 32*x0), xmask, other=0.0)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tl.load(in_ptr2 + (r0_1 + 32*x0), xmask, other=0.0)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tl.broadcast_to(tmp0, [XBLOCK, R0_BLOCK])
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.broadcast_to(tmp1, [XBLOCK, R0_BLOCK])
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tl.broadcast_to(tmp2, [XBLOCK, R0_BLOCK])
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tl.where(xmask, tmp3, 0)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tl.where(xmask, tmp4, 0)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = tl.where(xmask, tmp5, 0)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10, tmp11, tmp12 = triton_helpers.welford(tmp7, tmp8, tmp9, 1)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp13 = tmp10[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp14 = tmp11[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp15 = tmp12[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x0), tmp13, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x0), tmp14, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (8 - 1)) / (8));
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_per_fused_native_group_norm_34 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_per_fused_native_group_norm_34 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cp3mlrwytqztypeekbp62tyad6csd27igkrisuwn7urgixbsnts6.cubin", "triton_per_fused_native_group_norm_34", 32, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_339 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_340 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_341 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_342 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_343 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_344 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_345 = r0_numel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_346 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_339, &var_340, &var_341, &var_342, &var_343, &var_344, &var_345, &global_scratch_346};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_per_fused_native_group_norm_34, grid_0, grid_1, grid_2, 2, 32, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename out_ptr1_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_35(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr3_type_& in_ptr3,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr4_type_& in_ptr4,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_35', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 8388608},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'out_ptr1': '*bf16', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (9,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_35', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_35(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr1, ks0, ks1, ks2, xnumel, XBLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x2 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // ks0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (x2), None, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tl.load(in_ptr1 + (x1 // 16), None, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.load(in_ptr2 + (x1 // 16), None, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp12 = tl.load(in_ptr3 + (x1), None, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp14 = tl.load(in_ptr4 + (x1), None, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tmp0.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tmp1 - tmp2
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = 256*ks1*ks2
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tmp5.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = (tmp4 / tmp6)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = 1e-06
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = tmp7 + tmp8
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10 = libdevice.rsqrt(tmp9)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp11 = tmp3 * tmp10
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp13 = tmp11 * tmp12
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp15 = tmp13 + tmp14
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp16 = tl.sigmoid(tmp15)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp17 = tmp15 * tmp16
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp18 = tmp17.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x2), tmp18, None)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (1024 - 1)) / (1024));
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_35 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_35 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cd6ovmq3a3hzsgvwyxirdqddks7xg2nquokhfglxjvhglsiuzfpk.cubin", "triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_35", 0, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_347 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_348 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_349 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_350 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_351 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_352 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_353 = ks0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_354 = ks1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_355 = ks2;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_356 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_357 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_347, &var_348, &var_349, &var_350, &var_351, &var_352, &var_353, &var_354, &var_355, &var_356, &global_scratch_357};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_35, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename out_ptr0_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_convolution_mul_sigmoid_36(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_convolution_mul_sigmoid_36', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 2097152},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_convolution_mul_sigmoid_36', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_convolution_mul_sigmoid_36(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 1179648
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (x0), None)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tmp0.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x0), tmp1, None)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (1024 - 1)) / (1024));
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_convolution_mul_sigmoid_36 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_convolution_mul_sigmoid_36 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cdunjhkwacuurik4z6ixf3swngb2dtnqggz5b3wdvcqjkl3wbuif.cubin", "triton_poi_fused__to_copy_convolution_mul_sigmoid_36", 0, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_358 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_359 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_360 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_361 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_358, &var_359, &var_360, &global_scratch_361};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_convolution_mul_sigmoid_36, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename out_ptr0_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_convolution_mul_sigmoid_37(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_convolution_mul_sigmoid_37', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 256},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_convolution_mul_sigmoid_37', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_convolution_mul_sigmoid_37(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 256
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (x0), xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tmp0.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x0), tmp1, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (256 - 1)) / (256));
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_convolution_mul_sigmoid_37 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_convolution_mul_sigmoid_37 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/ctgmkxd56s5k6dn3ye32ivg6jp2fdiunnkcrsiri6f2uvy4wmgsz.cubin", "triton_poi_fused__to_copy_convolution_mul_sigmoid_37", 0, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_362 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_363 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_364 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_365 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_362, &var_363, &var_364, &global_scratch_365};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_convolution_mul_sigmoid_37, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename out_ptr0_type_, typename out_ptr1_type_, typename out_ptr2_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_red_fused_native_group_norm_38(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr2_type_& out_ptr2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t r0_numel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_red_fused_native_group_norm_38', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.reduction(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 512, 'r0_': 8192},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reduction_hint=ReductionHint.INNER,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'out_ptr2': '*fp32', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_native_group_norm_38', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 2, 'num_reduction': 3, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_red_fused_native_group_norm_38(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, ks0, ks1, ks2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 512
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rnumel = r0_numel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rbase = r0_base
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = (xindex % 16)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // 16
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x3 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_index = r0_offset + r0_base
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_mask = r0_index < r0_numel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] roffset = r0_offset
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rindex = r0_index
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_2 = r0_index
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (16*ks1*ks2*((((r0_2 + 8*ks1*ks2*x0) // ks0) % 8)) + 128*ks1*ks2*x1 + (((((r0_2 + 8*ks1*ks2*x0) % ks0)) % ks0))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (8*x1 + ((((r0_2 + 8*ks1*ks2*x0) // ks0) % 8))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tmp0 + tmp1
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tmp2.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.broadcast_to(tmp3, [XBLOCK, R0_BLOCK])
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_mean_next, tmp5_m2_next, tmp5_weight_next = triton_helpers.welford_reduce(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4, tmp5_mean, tmp5_m2, tmp5_weight, roffset == 0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_mean = tl.where(r0_mask & xmask, tmp5_mean_next, tmp5_mean)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_m2 = tl.where(r0_mask & xmask, tmp5_m2_next, tmp5_m2)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_weight = tl.where(r0_mask & xmask, tmp5_weight_next, tmp5_weight)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8, tmp9, tmp10 = triton_helpers.welford(tmp5_mean, tmp5_m2, tmp5_weight, 1)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp8[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tmp9[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tmp10[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x3), tmp5, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x3), tmp6, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr2 + (x3), tmp7, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_red_fused_native_group_norm_38 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_red_fused_native_group_norm_38 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/csdvwyo6raadjgh5agexkdti5gc7kskvlb2cc5nh3g2foo2s43d3.cubin", "triton_red_fused_native_group_norm_38", 192, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_366 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_367 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_368 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_369 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_370 = reinterpret_cast<CUdeviceptr>(out_ptr2.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_371 = ks0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_372 = ks1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_373 = ks2;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_374 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_375 = r0_numel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_376 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_366, &var_367, &var_368, &var_369, &var_370, &var_371, &var_372, &var_373, &var_374, &var_375, &global_scratch_376};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_red_fused_native_group_norm_38, grid_0, grid_1, grid_2, 16, 192, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename out_ptr0_type_, typename out_ptr1_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_per_fused_native_group_norm_39(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t r0_numel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_per_fused_native_group_norm_39', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.persistent_reduction(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 32, 'r0_': 16},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reduction_hint=ReductionHint.INNER,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_per_fused_native_group_norm_39', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 2, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_per_fused_native_group_norm_39(in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, xnumel, r0_numel, XBLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 32
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_numel = 16
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] R0_BLOCK: tl.constexpr = 16
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rnumel = r0_numel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_index = tl.arange(0, R0_BLOCK)[None, :]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_offset = 0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_mask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] roffset = r0_offset
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rindex = r0_index
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_1 = r0_index
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (r0_1 + 16*x0), xmask, other=0.0)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (r0_1 + 16*x0), xmask, other=0.0)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tl.load(in_ptr2 + (r0_1 + 16*x0), xmask, other=0.0)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tl.broadcast_to(tmp0, [XBLOCK, R0_BLOCK])
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.broadcast_to(tmp1, [XBLOCK, R0_BLOCK])
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tl.broadcast_to(tmp2, [XBLOCK, R0_BLOCK])
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tl.where(xmask, tmp3, 0)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tl.where(xmask, tmp4, 0)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = tl.where(xmask, tmp5, 0)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10, tmp11, tmp12 = triton_helpers.welford(tmp7, tmp8, tmp9, 1)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp13 = tmp10[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp14 = tmp11[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp15 = tmp12[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x0), tmp13, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x0), tmp14, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (8 - 1)) / (8));
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_per_fused_native_group_norm_39 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_per_fused_native_group_norm_39 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cj4bp4jzrxnhmbr3e2scgoaq5td27c3fdzfqpe4mqhhudol6hpty.cubin", "triton_per_fused_native_group_norm_39", 32, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_377 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_378 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_379 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_380 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_381 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_382 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_383 = r0_numel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_384 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_377, &var_378, &var_379, &var_380, &var_381, &var_382, &var_383, &global_scratch_384};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_per_fused_native_group_norm_39, grid_0, grid_1, grid_2, 2, 32, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename out_ptr0_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_convolution_40(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_convolution_40', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 131072},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_convolution_40', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_convolution_40(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 131072
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (x0), None)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tmp0.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x0), tmp1, None)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (1024 - 1)) / (1024));
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_convolution_40 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_convolution_40 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/conw5fxkzfoxhoyfcwfpxktjs6s35dzi7xuf64335cmzpbynf6qb.cubin", "triton_poi_fused__to_copy_convolution_40", 0, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_385 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_386 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_387 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_388 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_385, &var_386, &var_387, &global_scratch_388};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_convolution_40, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename in_ptr5_type_, typename out_ptr1_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_41(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr3_type_& in_ptr3,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr4_type_& in_ptr4,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr5_type_& in_ptr5,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_41', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 4194304},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'in_ptr5': '*fp32', 'out_ptr1': '*bf16', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (10,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_41', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 6, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_41(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr1, ks0, ks1, ks2, xnumel, XBLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x2 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // ks0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (x2), None, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (x1), None, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.load(in_ptr2 + (x1 // 8), None, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tl.load(in_ptr3 + (x1 // 8), None, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp14 = tl.load(in_ptr4 + (x1), None, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp16 = tl.load(in_ptr5 + (x1), None, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tmp0 + tmp1
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tmp2.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp3 - tmp4
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = 128*ks1*ks2
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tmp7.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = (tmp6 / tmp8)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10 = 1e-06
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp11 = tmp9 + tmp10
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp12 = libdevice.rsqrt(tmp11)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp13 = tmp5 * tmp12
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp15 = tmp13 * tmp14
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp17 = tmp15 + tmp16
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp18 = tl.sigmoid(tmp17)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp19 = tmp17 * tmp18
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp20 = tmp19.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x2), tmp20, None)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (1024 - 1)) / (1024));
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_41 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_41 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cn32ldycezggq5ntm4nizx6k2eqojr6gs75ioxqikw6ydkbcighw.cubin", "triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_41", 0, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_389 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_390 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_391 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_392 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_393 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_394 = reinterpret_cast<CUdeviceptr>(in_ptr5.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_395 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_396 = ks0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_397 = ks1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_398 = ks2;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_399 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_400 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_389, &var_390, &var_391, &var_392, &var_393, &var_394, &var_395, &var_396, &var_397, &var_398, &var_399, &global_scratch_400};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_41, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename out_ptr0_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_convolution_mul_sigmoid_42(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_convolution_mul_sigmoid_42', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 1048576},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_convolution_mul_sigmoid_42', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_convolution_mul_sigmoid_42(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 589824
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (x0), None)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tmp0.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x0), tmp1, None)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (1024 - 1)) / (1024));
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_convolution_mul_sigmoid_42 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_convolution_mul_sigmoid_42 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cfj4tf6wgkmcgrjmlx7vtnbevgypigbvvy2derctmuvlat5k5xq6.cubin", "triton_poi_fused__to_copy_convolution_mul_sigmoid_42", 0, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_401 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_402 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_403 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_404 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_401, &var_402, &var_403, &global_scratch_404};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_convolution_mul_sigmoid_42, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename out_ptr0_type_, typename out_ptr1_type_, typename out_ptr2_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_red_fused_native_group_norm_43(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr3_type_& in_ptr3,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr2_type_& out_ptr2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t r0_numel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_red_fused_native_group_norm_43', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.reduction(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 512, 'r0_': 8192},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reduction_hint=ReductionHint.INNER,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'in_ptr3': '*bf16', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'out_ptr2': '*fp32', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (10,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_native_group_norm_43', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 4, 'num_reduction': 3, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_red_fused_native_group_norm_43(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, out_ptr1, out_ptr2, ks0, ks1, ks2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 512
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rnumel = r0_numel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rbase = r0_base
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = (xindex % 16)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // 16
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x3 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_index = r0_offset + r0_base
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_mask = r0_index < r0_numel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] roffset = r0_offset
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rindex = r0_index
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_2 = r0_index
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (16*ks1*ks2*((((r0_2 + 8*ks1*ks2*x0) // ks0) % 8)) + 128*ks1*ks2*x1 + (((((r0_2 + 8*ks1*ks2*x0) % ks0)) % ks0))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (8*x1 + ((((r0_2 + 8*ks1*ks2*x0) // ks0) % 8))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tl.load(in_ptr2 + (16*ks1*ks2*((((r0_2 + 8*ks1*ks2*x0) // ks0) % 8)) + 128*ks1*ks2*x1 + (((((r0_2 + 8*ks1*ks2*x0) % ks0)) % ks0))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.load(in_ptr3 + (8*x1 + ((((r0_2 + 8*ks1*ks2*x0) // ks0) % 8))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tmp0 + tmp1
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp3 + tmp4
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tmp2 + tmp5
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tmp6.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tl.broadcast_to(tmp7, [XBLOCK, R0_BLOCK])
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9_mean_next, tmp9_m2_next, tmp9_weight_next = triton_helpers.welford_reduce(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8, tmp9_mean, tmp9_m2, tmp9_weight, roffset == 0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9_mean = tl.where(r0_mask & xmask, tmp9_mean_next, tmp9_mean)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9_m2 = tl.where(r0_mask & xmask, tmp9_m2_next, tmp9_m2)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9_weight = tl.where(r0_mask & xmask, tmp9_weight_next, tmp9_weight)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp12, tmp13, tmp14 = triton_helpers.welford(tmp9_mean, tmp9_m2, tmp9_weight, 1)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = tmp12[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10 = tmp13[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp11 = tmp14[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x3), tmp9, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x3), tmp10, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr2 + (x3), tmp11, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_red_fused_native_group_norm_43 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_red_fused_native_group_norm_43 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/ciooem3z35kwd5bzfyhf5o3vvexbszb67gobf5kuget7qhzns573.cubin", "triton_red_fused_native_group_norm_43", 192, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_405 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_406 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_407 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_408 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_409 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_410 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_411 = reinterpret_cast<CUdeviceptr>(out_ptr2.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_412 = ks0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_413 = ks1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_414 = ks2;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_415 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_416 = r0_numel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_417 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_405, &var_406, &var_407, &var_408, &var_409, &var_410, &var_411, &var_412, &var_413, &var_414, &var_415, &var_416, &global_scratch_417};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_red_fused_native_group_norm_43, grid_0, grid_1, grid_2, 16, 192, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename in_ptr5_type_, typename in_ptr6_type_, typename in_ptr7_type_, typename out_ptr1_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_44(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr3_type_& in_ptr3,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr4_type_& in_ptr4,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr5_type_& in_ptr5,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr6_type_& in_ptr6,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr7_type_& in_ptr7,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_44', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 4194304},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'in_ptr3': '*bf16', 'in_ptr4': '*fp32', 'in_ptr5': '*fp32', 'in_ptr6': '*fp32', 'in_ptr7': '*fp32', 'out_ptr1': '*bf16', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]], (9,): [['tt.divisibility', 16]], (12,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_44', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 8, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_44(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, out_ptr1, ks0, ks1, ks2, xnumel, XBLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x2 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // ks0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (x2), None, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (x1), None, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tl.load(in_ptr2 + (x2), None, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.load(in_ptr3 + (x1), None, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tl.load(in_ptr4 + (x1 // 8), None, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10 = tl.load(in_ptr5 + (x1 // 8), None, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp18 = tl.load(in_ptr6 + (x1), None, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp20 = tl.load(in_ptr7 + (x1), None, eviction_policy='evict_last')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tmp0 + tmp1
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp3 + tmp4
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tmp2 + tmp5
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tmp6.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = tmp7 - tmp8
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp11 = 128*ks1*ks2
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp12 = tmp11.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp13 = (tmp10 / tmp12)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp14 = 1e-06
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp15 = tmp13 + tmp14
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp16 = libdevice.rsqrt(tmp15)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp17 = tmp9 * tmp16
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp19 = tmp17 * tmp18
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp21 = tmp19 + tmp20
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp22 = tl.sigmoid(tmp21)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp23 = tmp21 * tmp22
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp24 = tmp23.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x2), tmp24, None)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (1024 - 1)) / (1024));
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_44 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_44 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cc4joqjeiqk42d5ccbwlnsic3wwcoyv45f2h7f6rpovczqalzpe2.cubin", "triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_44", 0, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_418 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_419 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_420 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_421 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_422 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_423 = reinterpret_cast<CUdeviceptr>(in_ptr5.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_424 = reinterpret_cast<CUdeviceptr>(in_ptr6.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_425 = reinterpret_cast<CUdeviceptr>(in_ptr7.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_426 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_427 = ks0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_428 = ks1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_429 = ks2;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_430 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_431 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_418, &var_419, &var_420, &var_421, &var_422, &var_423, &var_424, &var_425, &var_426, &var_427, &var_428, &var_429, &var_430, &global_scratch_431};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_44, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_add_convolution_mul_sigmoid_45(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_out_ptr0_type_& in_out_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr3_type_& in_ptr3,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr4_type_& in_ptr4,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_add_convolution_mul_sigmoid_45', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 4194304},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'in_ptr3': '*bf16', 'in_ptr4': '*bf16', 'ks0': 'i32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_add_convolution_mul_sigmoid_45', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 6, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_add_convolution_mul_sigmoid_45(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, ks0, xnumel, XBLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x2 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // ks0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_out_ptr0 + (x2), None, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr0 + (x1), None, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tl.load(in_ptr1 + (x2), None, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.load(in_ptr2 + (x1), None, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tl.load(in_ptr3 + (x2), None, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tl.load(in_ptr4 + (x1), None, eviction_policy='evict_last').to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tmp0 + tmp1
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp3 + tmp4
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tmp2 + tmp5
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = tmp7 + tmp8
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10 = tmp6 + tmp9
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(in_out_ptr0 + (x2), tmp10, None)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (1024 - 1)) / (1024));
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_add_convolution_mul_sigmoid_45 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_add_convolution_mul_sigmoid_45 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cnxzgbgs6spla2etszz7nxrw2jgnxf7g2ulsrkz3crsqjjjrcefv.cubin", "triton_poi_fused__to_copy_add_convolution_mul_sigmoid_45", 0, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_432 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_433 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_434 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_435 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_436 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_437 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_438 = ks0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_439 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_440 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_432, &var_433, &var_434, &var_435, &var_436, &var_437, &var_438, &var_439, &global_scratch_440};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_add_convolution_mul_sigmoid_45, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename out_ptr0_type_, typename out_ptr1_type_, typename out_ptr2_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_red_fused_native_group_norm_46(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr2_type_& out_ptr2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t r0_numel,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_red_fused_native_group_norm_46', '''
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu()
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.reduction(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 512, 'r0_': 8192},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reduction_hint=ReductionHint.INNER,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'out_ptr2': '*fp32', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]]}]},
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_native_group_norm_46', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 3, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_red_fused_native_group_norm_46(in_ptr0, out_ptr0, out_ptr1, out_ptr2, ks0, ks1, ks2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 512
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rnumel = r0_numel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rbase = r0_base
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = (xindex % 16)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // 16
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x3 = xindex
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK):
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_index = r0_offset + r0_base
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_mask = r0_index < r0_numel
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] roffset = r0_offset
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rindex = r0_index
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_2 = r0_index
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (16*ks1*ks2*((((r0_2 + 8*ks1*ks2*x0) // ks0) % 8)) + 128*ks1*ks2*x1 + (((((r0_2 + 8*ks1*ks2*x0) % ks0)) % ks0))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tmp0.to(tl.float32)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tl.broadcast_to(tmp1, [XBLOCK, R0_BLOCK])
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_mean_next, tmp3_m2_next, tmp3_weight_next = triton_helpers.welford_reduce(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2, tmp3_mean, tmp3_m2, tmp3_weight, roffset == 0
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_mean = tl.where(r0_mask & xmask, tmp3_mean_next, tmp3_mean)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_m2 = tl.where(r0_mask & xmask, tmp3_m2_next, tmp3_m2)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_weight = tl.where(r0_mask & xmask, tmp3_weight_next, tmp3_weight)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6, tmp7, tmp8 = triton_helpers.welford(tmp3_mean, tmp3_m2, tmp3_weight, 1)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tmp6[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tmp7[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp8[:, None]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x3), tmp3, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x3), tmp4, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr2 + (x3), tmp5, xmask)
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda')
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_red_fused_native_group_norm_46 == nullptr) {
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_red_fused_native_group_norm_46 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cjh6z4avfhdhnsmc422ejqmgto2qf746pwta7xtecnwk6vibgis2.cubin", "triton_red_fused_native_group_norm_46", 192, cubin_dir_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_441 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_442 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_443 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_444 = reinterpret_cast<CUdeviceptr>(out_ptr2.data_ptr());
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_445 = ks0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_446 = ks1;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_447 = ks2;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_448 = xnumel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_449 = r0_numel;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_450 = 0;
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_441, &var_442, &var_443, &var_444, &var_445, &var_446, &var_447, &var_448, &var_449, &global_scratch_450};
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_red_fused_native_group_norm_46, grid_0, grid_1, grid_2, 16, 192, kernel_args_, stream_);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code]
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename out_ptr1_type_, typename kernels_type_>
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_47(
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2,
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.p
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment