-
-
Save tugsbayasgalan/b03f5c3a3259c6006cbc8c878d3443ac to your computer and use it in GitHub Desktop.
This file has been truncated, but you can view the full file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<frozen importlib._bootstrap_external>:1184: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. | |
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. | |
Device set to use cuda | |
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 | |
W0512 13:27:46.019000 3957859 site-packages/torch/_export/non_strict_utils.py:499] dimension inputs['z'].shape[0] 0/1 specialized; Dim.AUTO was specified along with a sample input with hint = 1. | |
V0512 13:27:56.870000 3957859 site-packages/torch/_inductor/codegen/wrapper.py:1471] [__output_code] Auto-tuning code written to /tmp/torchinductor_tmanlaibaatar/tmpsy995vep.py | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] Output wrapper code: | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] #include <torch/csrc/inductor/aoti_include/cuda.h> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] // Definition of AOTI runtime interface functions | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] #include <torch/csrc/inductor/aoti_runtime/interface.h> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] #include <torch/csrc/inductor/aoti_runtime/model_container.h> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] #include <iostream> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] #include <sstream> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] #include <stdexcept> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] #include <vector> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] #define CONVERT_EXCEPTION_TO_ERROR_CODE(...) \ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] try { \ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] __VA_ARGS__ \ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } catch (const std::exception& e) { \ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] std::cerr << "Error: " << e.what() << std::endl; \ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] return AOTI_RUNTIME_FAILURE; \ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } catch (...) { \ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] std::cerr << "Unknown exception occurred." << std::endl; \ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] return AOTI_RUNTIME_FAILURE; \ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } \ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] return AOTI_RUNTIME_SUCCESS; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] #define AOTI_VECTOR_SIZE_CHECK(actual_size, expected_size, name) \ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] do { \ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTI_RUNTIME_CHECK( \ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] actual_size == expected_size, \ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] "expected " + std::string(name) + " vector size to be " + \ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] std::to_string(expected_size) + ", but got " + \ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] std::to_string(actual_size)); \ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } while (0) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] // AOTInductor uses at::addmm_out, which doesn't support | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] // arguments that require gradient. For this reason, we | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] // enforce no_grad context for run APIs. | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] // | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] // A RAII, thread local (!) guard that enables or disables grad mode upon | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] // construction, and sets it back to the original value upon destruction. | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] struct AOTINoGradGuard { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTINoGradGuard() : prev_mode(aoti_torch_grad_mode_is_enabled()) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] aoti_torch_grad_mode_set_enabled(false); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ~AOTINoGradGuard() { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] aoti_torch_grad_mode_set_enabled(prev_mode); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] bool prev_mode; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] extern "C" { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerCreate( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle* container_handle, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_t num_models, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] bool is_cpu, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const char* cubin_dir) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] return AOTInductorModelContainerCreateWithDevice( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container_handle, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] num_models, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] is_cpu ? "cpu" : "cuda", | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cubin_dir); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerCreateWithDevice( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle* container_handle, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_t num_models, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const char* device_str, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const char* cubin_dir) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (num_models == 0) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] std::cerr << "Error: num_models must be positive, but got 0" << std::endl; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] return AOTI_RUNTIME_FAILURE; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE({ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] std::optional<std::string> cubin_dir_opt; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (cubin_dir != nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cubin_dir_opt.emplace(cubin_dir); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto* container = new torch::aot_inductor::AOTInductorModelContainer( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] num_models, std::string(device_str), cubin_dir_opt); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] *container_handle = | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<AOTInductorModelContainerHandle>(container); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerDelete( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle container_handle) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE({ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto* container = | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container_handle); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] delete container; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerRun( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle container_handle, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AtenTensorHandle* input_handles, // array of input AtenTensorHandle; handles | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] // are stolen; the array itself is borrowed | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_t num_inputs, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AtenTensorHandle* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] output_handles, // array for writing output AtenTensorHandle; handles | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] // will be stolen by the caller; the array itself is | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] // borrowed | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_t num_outputs, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorStreamHandle stream_handle, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIProxyExecutorHandle proxy_executor_handle) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto* container = | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container_handle); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTI_VECTOR_SIZE_CHECK(num_inputs, container->num_inputs(), "inputs"); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTI_VECTOR_SIZE_CHECK(num_outputs, container->num_outputs(), "outputs"); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto stream = | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::DeviceStreamType>(stream_handle); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE({ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTINoGradGuard guard; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container->run( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] input_handles, output_handles, stream, proxy_executor_handle); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerRunSingleThreaded( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle container_handle, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AtenTensorHandle* input_handles, // array of input AtenTensorHandle; handles | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] // are stolen; the array itself is borrowed | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_t num_inputs, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AtenTensorHandle* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] output_handles, // array for writing output AtenTensorHandle; handles | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] // will be stolen by the caller; the array itself is | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] // borrowed | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_t num_outputs, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorStreamHandle stream_handle, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIProxyExecutorHandle proxy_executor_handle) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto* container = | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container_handle); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTI_VECTOR_SIZE_CHECK(num_inputs, container->num_inputs(), "inputs"); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTI_VECTOR_SIZE_CHECK(num_outputs, container->num_outputs(), "outputs"); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto stream = | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::DeviceStreamType>(stream_handle); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE({ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTINoGradGuard guard; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container->run_single_threaded( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] input_handles, output_handles, stream, proxy_executor_handle); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerGetNumConstants( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle container_handle, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_t* num_constants) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto* container = | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container_handle); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] { *num_constants = container->num_constants(); }) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerGetConstantName( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle container_handle, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_t idx, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const char** name) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto* container = | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container_handle); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] { *name = container->constant_name(idx); }) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerGetConstantOriginalFQN( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle container_handle, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_t idx, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const char** original_fqn) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto* container = | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container_handle); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] { *original_fqn = container->constant_original_fqn(idx); }) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerGetConstantFromFolded( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle container_handle, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_t idx, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] bool* from_folded) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto* container = | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(container_handle); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE({ *from_folded = container->constant_from_folded(idx); }) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerGetConstantType( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle container_handle, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_t idx, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t* type) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto* container = | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(container_handle); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE({ *type = container->constant_type(idx); }) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerGetConstantDtype( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle container_handle, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_t idx, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t* dtype) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto* container = | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container_handle); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] { *dtype = container->constant_dtype(idx); }) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerExtractConstantsMap( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle container_handle, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorConstantMapHandle constant_map_handle, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] bool use_inactive) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto* container = | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container_handle); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto constants_map = reinterpret_cast<std::unordered_map<std::string, AtenTensorHandle>*>(constant_map_handle); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] { const auto ret = container->extract_constants_map(use_inactive); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] for (const auto& pair: ret) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_map->emplace(pair.first, pair.second); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerUpdateUserManagedConstantBuffer( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle container_handle, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorConstantMapHandle constant_map_handle, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] bool use_inactive, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] bool validate_full_update) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto* container = | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container_handle); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto input_map = reinterpret_cast<std::unordered_map<std::string, AtenTensorHandle>*>(constant_map_handle); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE({ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container->update_constant_buffer( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] *input_map, use_inactive, validate_full_update, /* user_managed = */ true); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerUpdateConstantBuffer( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle container_handle, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorConstantMapHandle constant_map_handle, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] bool use_inactive, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] bool validate_full_update) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto* container = | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container_handle); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto input_map = reinterpret_cast<std::unordered_map<std::string, AtenTensorHandle>*>(constant_map_handle); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE({ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container->update_constant_buffer( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] *input_map, use_inactive, validate_full_update); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerUpdateInactiveConstantBuffer( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle container_handle, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorConstantMapHandle constant_map_handle) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] return AOTInductorModelContainerUpdateConstantBuffer(container_handle, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constant_map_handle, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*use_inactive*/ true, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /*validate_full_update*/ true); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerFreeInactiveConstantBuffer( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle container_handle) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto* container = | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container_handle); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE({ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container->free_inactive_constant_buffer(); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerRunConstantFolding( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle container_handle, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] bool use_inactive, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorStreamHandle stream_handle, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIProxyExecutorHandle proxy_executor_handle) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto* container = | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container_handle); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto stream = | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::DeviceStreamType>(stream_handle); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE({ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTINoGradGuard guard; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container->run_const_fold(use_inactive, stream, proxy_executor_handle); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerSwapConstantBuffer( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle container_handle) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto* container = | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container_handle); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE({ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container->swap_constant_buffer(); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerGetNumInputs( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle container_handle, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_t* ret_num_inputs) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto* container = | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container_handle); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] { *ret_num_inputs = container->num_inputs(); }) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerGetInputName( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle container_handle, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_t input_idx, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const char** ret_input_names) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto* container = | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container_handle); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] { *ret_input_names = container->input_name(input_idx); }) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerGetNumOutputs( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle container_handle, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_t* ret_num_outputs) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto* container = | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container_handle); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] { *ret_num_outputs = container->num_outputs(); }) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerGetOutputName( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle container_handle, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_t output_idx, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const char** ret_output_names) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto* container = | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container_handle); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] { *ret_output_names = container->output_name(output_idx); }) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelContainerGetCallSpec( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelContainerHandle container_handle, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const char** in_spec, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const char** out_spec) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto* container = | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] container_handle); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE({ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] *in_spec = container->get_in_spec(); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] *out_spec = container->get_out_spec(); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelCreate( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelHandle* model_handle, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorConstantMapHandle constant_map_handle){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE({ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto constant_map = std::make_shared<torch::aot_inductor::ConstantMap>(); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto constant_array = std::make_shared<std::vector<torch::aot_inductor::ConstantHandle>>(); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto input_map = reinterpret_cast<std::unordered_map<std::string, AtenTensorHandle>*>(constant_map_handle); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto model = new torch::aot_inductor::AOTInductorModel( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constant_map, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constant_array, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] "cpu", // device_str is hardcoded, as AOTInductorModelCreate is only use for CPU models | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] "" | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (input_map) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] for (auto const& kv : *input_map) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constant_map->emplace(kv.first, kv.second); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } else { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] model->load_constants(); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] *model_handle = reinterpret_cast<AOTInductorModelHandle>(model); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] })} | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelRun( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelHandle model_handle, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AtenTensorHandle* input_handles, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AtenTensorHandle* output_handles) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto model = | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::AOTInductorModel*>(model_handle); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE({ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTINoGradGuard guard; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] model->run_impl( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] input_handles, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] output_handles, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] (torch::aot_inductor::DeviceStreamType) nullptr, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] nullptr); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelDelete(AOTInductorModelHandle model_handle){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE({ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto model = reinterpret_cast<torch::aot_inductor::AOTInductorModel*>( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] model_handle); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] delete model; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] })} | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelGetNumOutputs( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelHandle model_handle, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_t* ret_num_outputs) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE({ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto model = reinterpret_cast<torch::aot_inductor::AOTInductorModel*>(model_handle); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] *ret_num_outputs = model->num_outputs(); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIRuntimeError AOTInductorModelUpdateConstantsMap( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModelHandle model_handle, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorConstantMapHandle constant_map_handle) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto model = | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<torch::aot_inductor::AOTInductorModel*>(model_handle); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CONVERT_EXCEPTION_TO_ERROR_CODE({ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto constant_map = std::make_shared<torch::aot_inductor::ConstantMap>(); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] auto input_map = | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reinterpret_cast<std::unordered_map<std::string, AtenTensorHandle>*>( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constant_map_handle); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] for (auto const& kv : *input_map) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constant_map->emplace(kv.first, kv.second); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] model->update_constants_map(std::move(constant_map)); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } // extern "C" | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
// Evaluates EXPR (a CUDA driver call returning CUresult) and throws
// std::runtime_error on failure.  The error code is first translated to a
// message via cuGetErrorString; if that translation itself fails, a generic
// "invalid error code!" message is thrown instead.
// NOTE(review): the macro body ends with `while (0);` — the trailing
// semicolon is non-idiomatic (it breaks dangling-else use), but it is
// load-bearing here: at least one call site (the cuFuncSetAttribute call in
// loadKernel) invokes the macro without its own `;`.  Do not remove it
// without auditing all call sites.
#define CUDA_DRIVER_CHECK(EXPR)                    \
do {                                               \
    CUresult code = EXPR;                          \
    const char *msg;                               \
    CUresult code_get_error = cuGetErrorString(code, &msg); \
    if (code_get_error != CUDA_SUCCESS) {          \
        throw std::runtime_error(                  \
            std::string("CUDA driver error: ") +   \
            std::string("invalid error code!"));   \
    }                                              \
    if (code != CUDA_SUCCESS) {                    \
        throw std::runtime_error(                  \
            std::string("CUDA driver error: ") +   \
            std::string(msg));                     \
    }                                              \
} while (0);
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline CUfunction loadKernel( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] std::string filePath, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::string &funcName, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t sharedMemBytes, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string> &cubinDir = std::nullopt) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (cubinDir) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] std::filesystem::path p1{*cubinDir}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] std::filesystem::path p2{filePath}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filePath = (p1 / p2.filename()).string(); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUmodule mod; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction func; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUDA_DRIVER_CHECK(cuModuleLoad(&mod, filePath.c_str())); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUDA_DRIVER_CHECK(cuModuleGetFunction(&func, mod, funcName.c_str())); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (sharedMemBytes > 0) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUDA_DRIVER_CHECK(cuFuncSetAttribute( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] func, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] sharedMemBytes | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] return func; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void launchKernel( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction func, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t gridX, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t gridY, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t gridZ, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t numWarps, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t sharedMemBytes, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* args[], | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUDA_DRIVER_CHECK(cuLaunchKernel( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] func, gridX, gridY, gridZ, 32*numWarps, 1, 1, sharedMemBytes, stream, args, nullptr | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] )); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CACHE_TORCH_DTYPE(bfloat16); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CACHE_TORCH_DTYPE(float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CACHE_TORCH_DEVICE(cuda); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CACHE_TORCH_LAYOUT(strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] namespace torch::aot_inductor { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] namespace { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] class AOTInductorModelKernels : public AOTInductorModelKernelsBase { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] public: | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_per_fused_native_group_norm_24{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_per_fused_native_group_norm_34{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_per_fused_native_group_norm_39{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_per_fused_native_group_norm_4{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__scaled_dot_product_efficient_attention_clone_11{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_22{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_31{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_32{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_48{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_49{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_add_convolution_mul_sigmoid_12{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_add_convolution_mul_sigmoid_17{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_add_convolution_mul_sigmoid_28{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_add_convolution_mul_sigmoid_45{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_add_convolution_mul_sigmoid_60{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_0{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_1{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_10{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_2{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_40{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_55{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_14{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_16{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_19{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_21{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_25{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_27{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_30{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_35{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_41{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_44{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_47{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_5{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_51{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_56{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_59{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_62{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_64{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_sigmoid_36{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_sigmoid_37{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_sigmoid_42{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_sigmoid_52{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_sigmoid_53{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_sigmoid_57{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_sigmoid_6{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_sigmoid_65{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_sigmoid_66{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_mul_sigmoid_67{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_poi_fused__to_copy_convolution_native_group_norm_9{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_red_fused_native_group_norm_13{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_red_fused_native_group_norm_15{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_red_fused_native_group_norm_18{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_red_fused_native_group_norm_20{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_red_fused_native_group_norm_23{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_red_fused_native_group_norm_26{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_red_fused_native_group_norm_29{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_red_fused_native_group_norm_3{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_red_fused_native_group_norm_33{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_red_fused_native_group_norm_38{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_red_fused_native_group_norm_43{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_red_fused_native_group_norm_46{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_red_fused_native_group_norm_50{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_red_fused_native_group_norm_54{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_red_fused_native_group_norm_58{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_red_fused_native_group_norm_61{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_red_fused_native_group_norm_63{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_red_fused_native_group_norm_7{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUfunction triton_red_fused_native_group_norm_8{nullptr}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] }; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } // namespace | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTInductorModel::AOTInductorModel(std::shared_ptr<ConstantMap> constants_map, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] std::shared_ptr<std::vector<ConstantHandle>> constants_array, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::string& device_str, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] std::optional<std::string> cubin_dir, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] bool include_weights) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] : AOTInductorModelBase(1, 1, 137, device_str, cubin_dir, true) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inputs_info_[0].name = "arg138_1"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[0].name = "conv_in_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[0].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[0].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[0].data_size = 294912; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[0].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[0].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[0].shape = {512, 16, 3, 3}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[0].stride = {144, 9, 3, 1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[0].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[0].original_fqn = "conv_in.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[1].name = "conv_in_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[1].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[1].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[1].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[1].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[1].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[1].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[1].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[1].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[1].original_fqn = "conv_in.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[2].name = "mid_block_1_norm1_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[2].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[2].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[2].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[2].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[2].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[2].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[2].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[2].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[2].original_fqn = "mid.block_1.norm1.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[3].name = "mid_block_1_norm1_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[3].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[3].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[3].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[3].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[3].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[3].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[3].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[3].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[3].original_fqn = "mid.block_1.norm1.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[4].name = "mid_block_1_conv1_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[4].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[4].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[4].data_size = 9437184; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[4].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[4].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[4].shape = {512, 512, 3, 3}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[4].stride = {4608, 9, 3, 1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[4].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[4].original_fqn = "mid.block_1.conv1.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[5].name = "mid_block_1_conv1_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[5].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[5].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[5].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[5].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[5].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[5].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[5].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[5].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[5].original_fqn = "mid.block_1.conv1.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[6].name = "mid_block_1_norm2_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[6].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[6].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[6].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[6].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[6].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[6].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[6].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[6].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[6].original_fqn = "mid.block_1.norm2.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[7].name = "mid_block_1_norm2_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[7].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[7].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[7].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[7].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[7].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[7].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[7].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[7].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[7].original_fqn = "mid.block_1.norm2.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[8].name = "mid_block_1_conv2_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[8].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[8].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[8].data_size = 9437184; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[8].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[8].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[8].shape = {512, 512, 3, 3}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[8].stride = {4608, 9, 3, 1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[8].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[8].original_fqn = "mid.block_1.conv2.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[9].name = "mid_block_1_conv2_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[9].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[9].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[9].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[9].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[9].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[9].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[9].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[9].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[9].original_fqn = "mid.block_1.conv2.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[10].name = "mid_attn_1_norm_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[10].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[10].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[10].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[10].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[10].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[10].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[10].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[10].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[10].original_fqn = "mid.attn_1.norm.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[11].name = "mid_attn_1_norm_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[11].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[11].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[11].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[11].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[11].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[11].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[11].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[11].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[11].original_fqn = "mid.attn_1.norm.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[12].name = "mid_attn_1_q_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[12].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[12].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[12].data_size = 1048576; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[12].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[12].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[12].shape = {512, 512, 1, 1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[12].stride = {512, 1, 1, 1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[12].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[12].original_fqn = "mid.attn_1.q.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[13].name = "mid_attn_1_q_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[13].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[13].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[13].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[13].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[13].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[13].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[13].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[13].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[13].original_fqn = "mid.attn_1.q.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[14].name = "mid_attn_1_k_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[14].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[14].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[14].data_size = 1048576; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[14].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[14].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[14].shape = {512, 512, 1, 1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[14].stride = {512, 1, 1, 1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[14].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[14].original_fqn = "mid.attn_1.k.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[15].name = "mid_attn_1_k_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[15].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[15].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[15].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[15].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[15].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[15].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[15].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[15].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[15].original_fqn = "mid.attn_1.k.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[16].name = "mid_attn_1_v_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[16].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[16].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[16].data_size = 1048576; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[16].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[16].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[16].shape = {512, 512, 1, 1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[16].stride = {512, 1, 1, 1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[16].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[16].original_fqn = "mid.attn_1.v.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[17].name = "mid_attn_1_v_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[17].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[17].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[17].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[17].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[17].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[17].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[17].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[17].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[17].original_fqn = "mid.attn_1.v.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[18].name = "mid_attn_1_proj_out_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[18].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[18].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[18].data_size = 1048576; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[18].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[18].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[18].shape = {512, 512, 1, 1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[18].stride = {512, 1, 1, 1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[18].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[18].original_fqn = "mid.attn_1.proj_out.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[19].name = "mid_attn_1_proj_out_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[19].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[19].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[19].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[19].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[19].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[19].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[19].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[19].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[19].original_fqn = "mid.attn_1.proj_out.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[20].name = "mid_block_2_norm1_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[20].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[20].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[20].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[20].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[20].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[20].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[20].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[20].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[20].original_fqn = "mid.block_2.norm1.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[21].name = "mid_block_2_norm1_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[21].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[21].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[21].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[21].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[21].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[21].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[21].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[21].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[21].original_fqn = "mid.block_2.norm1.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[22].name = "mid_block_2_conv1_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[22].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[22].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[22].data_size = 9437184; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[22].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[22].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[22].shape = {512, 512, 3, 3}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[22].stride = {4608, 9, 3, 1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[22].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[22].original_fqn = "mid.block_2.conv1.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[23].name = "mid_block_2_conv1_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[23].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[23].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[23].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[23].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[23].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[23].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[23].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[23].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[23].original_fqn = "mid.block_2.conv1.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[24].name = "mid_block_2_norm2_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[24].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[24].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[24].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[24].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[24].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[24].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[24].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[24].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[24].original_fqn = "mid.block_2.norm2.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[25].name = "mid_block_2_norm2_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[25].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[25].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[25].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[25].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[25].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[25].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[25].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[25].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[25].original_fqn = "mid.block_2.norm2.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[26].name = "mid_block_2_conv2_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[26].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[26].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[26].data_size = 9437184; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[26].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[26].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[26].shape = {512, 512, 3, 3}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[26].stride = {4608, 9, 3, 1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[26].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[26].original_fqn = "mid.block_2.conv2.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[27].name = "mid_block_2_conv2_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[27].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[27].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[27].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[27].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[27].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[27].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[27].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[27].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[27].original_fqn = "mid.block_2.conv2.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[28].name = "up_0_block_0_norm1_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[28].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[28].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[28].data_size = 1024; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[28].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[28].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[28].shape = {256}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[28].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[28].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[28].original_fqn = "up.0.block.0.norm1.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[29].name = "up_0_block_0_norm1_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[29].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[29].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[29].data_size = 1024; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[29].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[29].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[29].shape = {256}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[29].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[29].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[29].original_fqn = "up.0.block.0.norm1.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[30].name = "up_0_block_0_conv1_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[30].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[30].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[30].data_size = 1179648; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[30].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[30].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[30].shape = {128, 256, 3, 3}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[30].stride = {2304, 9, 3, 1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[30].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[30].original_fqn = "up.0.block.0.conv1.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[31].name = "up_0_block_0_conv1_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[31].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[31].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[31].data_size = 512; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[31].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[31].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[31].shape = {128}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[31].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[31].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[31].original_fqn = "up.0.block.0.conv1.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[32].name = "up_0_block_0_norm2_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[32].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[32].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[32].data_size = 512; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[32].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[32].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[32].shape = {128}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[32].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[32].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[32].original_fqn = "up.0.block.0.norm2.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[33].name = "up_0_block_0_norm2_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[33].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[33].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[33].data_size = 512; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[33].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[33].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[33].shape = {128}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[33].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[33].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[33].original_fqn = "up.0.block.0.norm2.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[34].name = "up_0_block_0_conv2_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[34].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[34].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[34].data_size = 589824; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[34].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[34].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[34].shape = {128, 128, 3, 3}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[34].stride = {1152, 9, 3, 1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[34].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[34].original_fqn = "up.0.block.0.conv2.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[35].name = "up_0_block_0_conv2_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[35].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[35].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[35].data_size = 512; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[35].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[35].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[35].shape = {128}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[35].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[35].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[35].original_fqn = "up.0.block.0.conv2.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[36].name = "up_0_block_0_nin_shortcut_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[36].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[36].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[36].data_size = 131072; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[36].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[36].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[36].shape = {128, 256, 1, 1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[36].stride = {256, 1, 1, 1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[36].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[36].original_fqn = "up.0.block.0.nin_shortcut.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[37].name = "up_0_block_0_nin_shortcut_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[37].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[37].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[37].data_size = 512; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[37].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[37].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[37].shape = {128}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[37].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[37].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[37].original_fqn = "up.0.block.0.nin_shortcut.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[38].name = "up_0_block_1_norm1_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[38].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[38].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[38].data_size = 512; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[38].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[38].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[38].shape = {128}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[38].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[38].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[38].original_fqn = "up.0.block.1.norm1.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[39].name = "up_0_block_1_norm1_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[39].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[39].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[39].data_size = 512; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[39].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[39].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[39].shape = {128}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[39].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[39].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[39].original_fqn = "up.0.block.1.norm1.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[40].name = "up_0_block_1_conv1_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[40].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[40].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[40].data_size = 589824; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[40].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[40].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[40].shape = {128, 128, 3, 3}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[40].stride = {1152, 9, 3, 1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[40].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[40].original_fqn = "up.0.block.1.conv1.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[41].name = "up_0_block_1_conv1_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[41].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[41].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[41].data_size = 512; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[41].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[41].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[41].shape = {128}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[41].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[41].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[41].original_fqn = "up.0.block.1.conv1.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[42].name = "up_0_block_1_norm2_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[42].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[42].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[42].data_size = 512; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[42].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[42].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[42].shape = {128}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[42].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[42].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[42].original_fqn = "up.0.block.1.norm2.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[43].name = "up_0_block_1_norm2_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[43].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[43].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[43].data_size = 512; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[43].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[43].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[43].shape = {128}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[43].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[43].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[43].original_fqn = "up.0.block.1.norm2.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[44].name = "up_0_block_1_conv2_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[44].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[44].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[44].data_size = 589824; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[44].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[44].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[44].shape = {128, 128, 3, 3}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[44].stride = {1152, 9, 3, 1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[44].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[44].original_fqn = "up.0.block.1.conv2.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[45].name = "up_0_block_1_conv2_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[45].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[45].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[45].data_size = 512; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[45].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[45].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[45].shape = {128}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[45].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[45].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[45].original_fqn = "up.0.block.1.conv2.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[46].name = "up_0_block_2_norm1_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[46].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[46].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[46].data_size = 512; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[46].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[46].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[46].shape = {128}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[46].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[46].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[46].original_fqn = "up.0.block.2.norm1.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[47].name = "up_0_block_2_norm1_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[47].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[47].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[47].data_size = 512; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[47].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[47].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[47].shape = {128}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[47].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[47].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[47].original_fqn = "up.0.block.2.norm1.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[48].name = "up_0_block_2_conv1_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[48].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[48].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[48].data_size = 589824; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[48].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[48].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[48].shape = {128, 128, 3, 3}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[48].stride = {1152, 9, 3, 1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[48].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[48].original_fqn = "up.0.block.2.conv1.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[49].name = "up_0_block_2_conv1_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[49].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[49].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[49].data_size = 512; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[49].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[49].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[49].shape = {128}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[49].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[49].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[49].original_fqn = "up.0.block.2.conv1.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[50].name = "up_0_block_2_norm2_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[50].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[50].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[50].data_size = 512; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[50].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[50].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[50].shape = {128}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[50].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[50].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[50].original_fqn = "up.0.block.2.norm2.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[51].name = "up_0_block_2_norm2_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[51].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[51].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[51].data_size = 512; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[51].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[51].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[51].shape = {128}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[51].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[51].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[51].original_fqn = "up.0.block.2.norm2.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[52].name = "up_0_block_2_conv2_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[52].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[52].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[52].data_size = 589824; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[52].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[52].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[52].shape = {128, 128, 3, 3}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[52].stride = {1152, 9, 3, 1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[52].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[52].original_fqn = "up.0.block.2.conv2.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[53].name = "up_0_block_2_conv2_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[53].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[53].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[53].data_size = 512; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[53].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[53].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[53].shape = {128}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[53].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[53].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[53].original_fqn = "up.0.block.2.conv2.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[54].name = "up_1_block_0_norm1_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[54].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[54].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[54].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[54].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[54].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[54].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[54].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[54].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[54].original_fqn = "up.1.block.0.norm1.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[55].name = "up_1_block_0_norm1_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[55].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[55].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[55].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[55].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[55].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[55].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[55].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[55].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[55].original_fqn = "up.1.block.0.norm1.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[56].name = "up_1_block_0_conv1_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[56].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[56].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[56].data_size = 4718592; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[56].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[56].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[56].shape = {256, 512, 3, 3}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[56].stride = {4608, 9, 3, 1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[56].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[56].original_fqn = "up.1.block.0.conv1.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[57].name = "up_1_block_0_conv1_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[57].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[57].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[57].data_size = 1024; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[57].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[57].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[57].shape = {256}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[57].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[57].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[57].original_fqn = "up.1.block.0.conv1.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[58].name = "up_1_block_0_norm2_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[58].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[58].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[58].data_size = 1024; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[58].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[58].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[58].shape = {256}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[58].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[58].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[58].original_fqn = "up.1.block.0.norm2.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[59].name = "up_1_block_0_norm2_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[59].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[59].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[59].data_size = 1024; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[59].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[59].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[59].shape = {256}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[59].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[59].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[59].original_fqn = "up.1.block.0.norm2.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[60].name = "up_1_block_0_conv2_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[60].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[60].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[60].data_size = 2359296; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[60].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[60].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[60].shape = {256, 256, 3, 3}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[60].stride = {2304, 9, 3, 1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[60].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[60].original_fqn = "up.1.block.0.conv2.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[61].name = "up_1_block_0_conv2_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[61].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[61].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[61].data_size = 1024; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[61].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[61].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[61].shape = {256}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[61].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[61].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[61].original_fqn = "up.1.block.0.conv2.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[62].name = "up_1_block_0_nin_shortcut_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[62].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[62].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[62].data_size = 524288; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[62].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[62].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[62].shape = {256, 512, 1, 1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[62].stride = {512, 1, 1, 1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[62].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[62].original_fqn = "up.1.block.0.nin_shortcut.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[63].name = "up_1_block_0_nin_shortcut_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[63].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[63].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[63].data_size = 1024; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[63].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[63].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[63].shape = {256}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[63].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[63].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[63].original_fqn = "up.1.block.0.nin_shortcut.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[64].name = "up_1_block_1_norm1_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[64].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[64].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[64].data_size = 1024; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[64].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[64].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[64].shape = {256}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[64].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[64].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[64].original_fqn = "up.1.block.1.norm1.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[65].name = "up_1_block_1_norm1_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[65].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[65].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[65].data_size = 1024; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[65].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[65].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[65].shape = {256}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[65].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[65].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[65].original_fqn = "up.1.block.1.norm1.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[66].name = "up_1_block_1_conv1_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[66].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[66].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[66].data_size = 2359296; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[66].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[66].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[66].shape = {256, 256, 3, 3}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[66].stride = {2304, 9, 3, 1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[66].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[66].original_fqn = "up.1.block.1.conv1.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[67].name = "up_1_block_1_conv1_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[67].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[67].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[67].data_size = 1024; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[67].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[67].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[67].shape = {256}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[67].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[67].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[67].original_fqn = "up.1.block.1.conv1.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[68].name = "up_1_block_1_norm2_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[68].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[68].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[68].data_size = 1024; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[68].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[68].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[68].shape = {256}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[68].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[68].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[68].original_fqn = "up.1.block.1.norm2.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[69].name = "up_1_block_1_norm2_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[69].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[69].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[69].data_size = 1024; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[69].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[69].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[69].shape = {256}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[69].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[69].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[69].original_fqn = "up.1.block.1.norm2.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[70].name = "up_1_block_1_conv2_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[70].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[70].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[70].data_size = 2359296; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[70].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[70].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[70].shape = {256, 256, 3, 3}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[70].stride = {2304, 9, 3, 1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[70].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[70].original_fqn = "up.1.block.1.conv2.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[71].name = "up_1_block_1_conv2_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[71].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[71].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[71].data_size = 1024; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[71].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[71].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[71].shape = {256}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[71].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[71].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[71].original_fqn = "up.1.block.1.conv2.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[72].name = "up_1_block_2_norm1_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[72].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[72].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[72].data_size = 1024; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[72].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[72].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[72].shape = {256}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[72].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[72].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[72].original_fqn = "up.1.block.2.norm1.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[73].name = "up_1_block_2_norm1_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[73].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[73].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[73].data_size = 1024; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[73].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[73].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[73].shape = {256}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[73].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[73].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[73].original_fqn = "up.1.block.2.norm1.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[74].name = "up_1_block_2_conv1_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[74].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[74].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[74].data_size = 2359296; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[74].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[74].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[74].shape = {256, 256, 3, 3}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[74].stride = {2304, 9, 3, 1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[74].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[74].original_fqn = "up.1.block.2.conv1.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[75].name = "up_1_block_2_conv1_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[75].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[75].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[75].data_size = 1024; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[75].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[75].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[75].shape = {256}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[75].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[75].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[75].original_fqn = "up.1.block.2.conv1.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[76].name = "up_1_block_2_norm2_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[76].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[76].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[76].data_size = 1024; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[76].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[76].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[76].shape = {256}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[76].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[76].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[76].original_fqn = "up.1.block.2.norm2.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[77].name = "up_1_block_2_norm2_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[77].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[77].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[77].data_size = 1024; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[77].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[77].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[77].shape = {256}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[77].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[77].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[77].original_fqn = "up.1.block.2.norm2.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[78].name = "up_1_block_2_conv2_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[78].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[78].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[78].data_size = 2359296; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[78].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[78].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[78].shape = {256, 256, 3, 3}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[78].stride = {2304, 9, 3, 1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[78].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[78].original_fqn = "up.1.block.2.conv2.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[79].name = "up_1_block_2_conv2_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[79].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[79].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[79].data_size = 1024; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[79].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[79].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[79].shape = {256}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[79].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[79].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[79].original_fqn = "up.1.block.2.conv2.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[80].name = "up_1_upsample_conv_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[80].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[80].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[80].data_size = 2359296; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[80].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[80].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[80].shape = {256, 256, 3, 3}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[80].stride = {2304, 9, 3, 1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[80].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[80].original_fqn = "up.1.upsample.conv.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[81].name = "up_1_upsample_conv_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[81].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[81].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[81].data_size = 1024; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[81].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[81].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[81].shape = {256}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[81].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[81].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[81].original_fqn = "up.1.upsample.conv.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[82].name = "up_2_block_0_norm1_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[82].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[82].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[82].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[82].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[82].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[82].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[82].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[82].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[82].original_fqn = "up.2.block.0.norm1.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[83].name = "up_2_block_0_norm1_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[83].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[83].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[83].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[83].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[83].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[83].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[83].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[83].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[83].original_fqn = "up.2.block.0.norm1.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[84].name = "up_2_block_0_conv1_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[84].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[84].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[84].data_size = 9437184; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[84].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[84].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[84].shape = {512, 512, 3, 3}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[84].stride = {4608, 9, 3, 1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[84].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[84].original_fqn = "up.2.block.0.conv1.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[85].name = "up_2_block_0_conv1_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[85].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[85].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[85].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[85].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[85].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[85].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[85].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[85].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[85].original_fqn = "up.2.block.0.conv1.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[86].name = "up_2_block_0_norm2_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[86].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[86].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[86].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[86].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[86].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[86].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[86].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[86].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[86].original_fqn = "up.2.block.0.norm2.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[87].name = "up_2_block_0_norm2_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[87].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[87].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[87].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[87].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[87].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[87].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[87].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[87].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[87].original_fqn = "up.2.block.0.norm2.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[88].name = "up_2_block_0_conv2_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[88].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[88].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[88].data_size = 9437184; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[88].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[88].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[88].shape = {512, 512, 3, 3}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[88].stride = {4608, 9, 3, 1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[88].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[88].original_fqn = "up.2.block.0.conv2.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[89].name = "up_2_block_0_conv2_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[89].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[89].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[89].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[89].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[89].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[89].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[89].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[89].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[89].original_fqn = "up.2.block.0.conv2.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[90].name = "up_2_block_1_norm1_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[90].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[90].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[90].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[90].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[90].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[90].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[90].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[90].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[90].original_fqn = "up.2.block.1.norm1.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[91].name = "up_2_block_1_norm1_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[91].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[91].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[91].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[91].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[91].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[91].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[91].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[91].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[91].original_fqn = "up.2.block.1.norm1.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[92].name = "up_2_block_1_conv1_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[92].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[92].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[92].data_size = 9437184; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[92].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[92].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[92].shape = {512, 512, 3, 3}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[92].stride = {4608, 9, 3, 1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[92].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[92].original_fqn = "up.2.block.1.conv1.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[93].name = "up_2_block_1_conv1_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[93].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[93].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[93].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[93].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[93].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[93].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[93].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[93].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[93].original_fqn = "up.2.block.1.conv1.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[94].name = "up_2_block_1_norm2_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[94].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[94].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[94].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[94].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[94].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[94].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[94].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[94].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[94].original_fqn = "up.2.block.1.norm2.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[95].name = "up_2_block_1_norm2_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[95].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[95].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[95].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[95].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[95].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[95].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[95].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[95].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[95].original_fqn = "up.2.block.1.norm2.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[96].name = "up_2_block_1_conv2_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[96].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[96].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[96].data_size = 9437184; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[96].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[96].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[96].shape = {512, 512, 3, 3}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[96].stride = {4608, 9, 3, 1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[96].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[96].original_fqn = "up.2.block.1.conv2.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[97].name = "up_2_block_1_conv2_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[97].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[97].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[97].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[97].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[97].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[97].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[97].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[97].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[97].original_fqn = "up.2.block.1.conv2.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[98].name = "up_2_block_2_norm1_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[98].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[98].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[98].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[98].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[98].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[98].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[98].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[98].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[98].original_fqn = "up.2.block.2.norm1.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[99].name = "up_2_block_2_norm1_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[99].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[99].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[99].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[99].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[99].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[99].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[99].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[99].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[99].original_fqn = "up.2.block.2.norm1.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[100].name = "up_2_block_2_conv1_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[100].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[100].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[100].data_size = 9437184; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[100].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[100].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[100].shape = {512, 512, 3, 3}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[100].stride = {4608, 9, 3, 1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[100].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[100].original_fqn = "up.2.block.2.conv1.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[101].name = "up_2_block_2_conv1_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[101].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[101].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[101].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[101].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[101].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[101].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[101].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[101].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[101].original_fqn = "up.2.block.2.conv1.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[102].name = "up_2_block_2_norm2_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[102].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[102].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[102].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[102].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[102].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[102].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[102].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[102].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[102].original_fqn = "up.2.block.2.norm2.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[103].name = "up_2_block_2_norm2_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[103].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[103].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[103].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[103].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[103].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[103].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[103].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[103].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[103].original_fqn = "up.2.block.2.norm2.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[104].name = "up_2_block_2_conv2_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[104].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[104].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[104].data_size = 9437184; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[104].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[104].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[104].shape = {512, 512, 3, 3}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[104].stride = {4608, 9, 3, 1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[104].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[104].original_fqn = "up.2.block.2.conv2.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[105].name = "up_2_block_2_conv2_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[105].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[105].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[105].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[105].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[105].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[105].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[105].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[105].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[105].original_fqn = "up.2.block.2.conv2.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[106].name = "up_2_upsample_conv_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[106].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[106].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[106].data_size = 9437184; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[106].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[106].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[106].shape = {512, 512, 3, 3}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[106].stride = {4608, 9, 3, 1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[106].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[106].original_fqn = "up.2.upsample.conv.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[107].name = "up_2_upsample_conv_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[107].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[107].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[107].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[107].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[107].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[107].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[107].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[107].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[107].original_fqn = "up.2.upsample.conv.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[108].name = "up_3_block_0_norm1_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[108].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[108].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[108].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[108].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[108].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[108].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[108].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[108].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[108].original_fqn = "up.3.block.0.norm1.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[109].name = "up_3_block_0_norm1_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[109].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[109].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[109].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[109].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[109].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[109].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[109].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[109].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[109].original_fqn = "up.3.block.0.norm1.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[110].name = "up_3_block_0_conv1_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[110].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[110].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[110].data_size = 9437184; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[110].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[110].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[110].shape = {512, 512, 3, 3}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[110].stride = {4608, 9, 3, 1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[110].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[110].original_fqn = "up.3.block.0.conv1.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[111].name = "up_3_block_0_conv1_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[111].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[111].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[111].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[111].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[111].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[111].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[111].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[111].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[111].original_fqn = "up.3.block.0.conv1.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[112].name = "up_3_block_0_norm2_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[112].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[112].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[112].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[112].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[112].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[112].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[112].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[112].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[112].original_fqn = "up.3.block.0.norm2.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[113].name = "up_3_block_0_norm2_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[113].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[113].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[113].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[113].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[113].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[113].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[113].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[113].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[113].original_fqn = "up.3.block.0.norm2.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[114].name = "up_3_block_0_conv2_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[114].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[114].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[114].data_size = 9437184; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[114].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[114].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[114].shape = {512, 512, 3, 3}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[114].stride = {4608, 9, 3, 1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[114].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[114].original_fqn = "up.3.block.0.conv2.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[115].name = "up_3_block_0_conv2_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[115].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[115].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[115].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[115].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[115].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[115].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[115].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[115].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[115].original_fqn = "up.3.block.0.conv2.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[116].name = "up_3_block_1_norm1_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[116].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[116].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[116].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[116].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[116].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[116].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[116].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[116].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[116].original_fqn = "up.3.block.1.norm1.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[117].name = "up_3_block_1_norm1_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[117].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[117].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[117].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[117].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[117].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[117].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[117].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[117].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[117].original_fqn = "up.3.block.1.norm1.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[118].name = "up_3_block_1_conv1_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[118].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[118].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[118].data_size = 9437184; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[118].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[118].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[118].shape = {512, 512, 3, 3}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[118].stride = {4608, 9, 3, 1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[118].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[118].original_fqn = "up.3.block.1.conv1.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[119].name = "up_3_block_1_conv1_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[119].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[119].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[119].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[119].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[119].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[119].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[119].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[119].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[119].original_fqn = "up.3.block.1.conv1.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[120].name = "up_3_block_1_norm2_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[120].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[120].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[120].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[120].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[120].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[120].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[120].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[120].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[120].original_fqn = "up.3.block.1.norm2.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[121].name = "up_3_block_1_norm2_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[121].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[121].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[121].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[121].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[121].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[121].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[121].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[121].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[121].original_fqn = "up.3.block.1.norm2.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[122].name = "up_3_block_1_conv2_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[122].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[122].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[122].data_size = 9437184; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[122].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[122].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[122].shape = {512, 512, 3, 3}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[122].stride = {4608, 9, 3, 1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[122].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[122].original_fqn = "up.3.block.1.conv2.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[123].name = "up_3_block_1_conv2_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[123].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[123].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[123].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[123].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[123].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[123].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[123].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[123].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[123].original_fqn = "up.3.block.1.conv2.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[124].name = "up_3_block_2_norm1_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[124].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[124].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[124].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[124].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[124].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[124].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[124].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[124].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[124].original_fqn = "up.3.block.2.norm1.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[125].name = "up_3_block_2_norm1_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[125].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[125].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[125].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[125].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[125].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[125].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[125].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[125].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[125].original_fqn = "up.3.block.2.norm1.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[126].name = "up_3_block_2_conv1_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[126].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[126].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[126].data_size = 9437184; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[126].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[126].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[126].shape = {512, 512, 3, 3}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[126].stride = {4608, 9, 3, 1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[126].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[126].original_fqn = "up.3.block.2.conv1.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[127].name = "up_3_block_2_conv1_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[127].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[127].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[127].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[127].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[127].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[127].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[127].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[127].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[127].original_fqn = "up.3.block.2.conv1.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[128].name = "up_3_block_2_norm2_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[128].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[128].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[128].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[128].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[128].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[128].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[128].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[128].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[128].original_fqn = "up.3.block.2.norm2.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[129].name = "up_3_block_2_norm2_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[129].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[129].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[129].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[129].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[129].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[129].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[129].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[129].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[129].original_fqn = "up.3.block.2.norm2.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[130].name = "up_3_block_2_conv2_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[130].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[130].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[130].data_size = 9437184; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[130].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[130].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[130].shape = {512, 512, 3, 3}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[130].stride = {4608, 9, 3, 1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[130].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[130].original_fqn = "up.3.block.2.conv2.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[131].name = "up_3_block_2_conv2_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[131].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[131].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[131].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[131].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[131].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[131].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[131].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[131].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[131].original_fqn = "up.3.block.2.conv2.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[132].name = "up_3_upsample_conv_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[132].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[132].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[132].data_size = 9437184; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[132].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[132].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[132].shape = {512, 512, 3, 3}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[132].stride = {4608, 9, 3, 1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[132].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[132].original_fqn = "up.3.upsample.conv.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[133].name = "up_3_upsample_conv_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[133].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[133].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[133].data_size = 2048; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[133].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[133].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[133].shape = {512}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[133].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[133].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[133].original_fqn = "up.3.upsample.conv.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[134].name = "norm_out_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[134].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[134].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[134].data_size = 512; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[134].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[134].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[134].shape = {128}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[134].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[134].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[134].original_fqn = "norm_out.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[135].name = "norm_out_bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[135].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[135].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[135].data_size = 512; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[135].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[135].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[135].shape = {128}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[135].stride = {1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[135].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[135].original_fqn = "norm_out.bias"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[136].name = "conv_out_weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[136].dtype = static_cast<int32_t>(cached_torch_dtype_float32); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[136].offset = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[136].data_size = 13824; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[136].from_folded = false; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[136].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[136].shape = {3, 128, 3, 3}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[136].stride = {1152, 9, 3, 1}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[136].layout = static_cast<int32_t>(cached_torch_layout_strided); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] constants_info_[136].original_fqn = "conv_out.weight"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] update_constants_map(std::move(constants_map)); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] update_constants_array(std::move(constants_array)); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] in_spec_ = "[1, {\"type\": \"builtins.tuple\", \"context\": \"null\", \"children_spec\": [{\"type\": \"builtins.tuple\", \"context\": \"null\", \"children_spec\": [{\"type\": null, \"context\": null, \"children_spec\": []}]}, {\"type\": \"builtins.dict\", \"context\": \"[]\", \"children_spec\": []}]}]"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] out_spec_ = "[1, {\"type\": null, \"context\": null, \"children_spec\": []}]"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] outputs_info_[0].name = "output0"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] this->kernels_ = std::make_unique<AOTInductorModelKernels>(); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] std::unordered_map<std::string, AtenTensorHandle> AOTInductorModel::const_run_impl( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] DeviceStreamType stream, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] AOTIProxyExecutorHandle proxy_executor, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] bool initialization | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (!initialization) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] std::cerr << "[WARNING] Calling constant_folding in model, but compiled with config: " | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] << "aot_inductor.use_runtime_constant_folding=False\n"; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] return {}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } // namespace torch::aot_inductor | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] using namespace torch::aot_inductor; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename out_ptr0_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_convolution_0( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_convolution_0', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 16384}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_convolution_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_convolution_0(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (x0), xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tmp0.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x0), tmp1, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (256 - 1)) / (256)); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_convolution_0 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_convolution_0 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cgxxjgxrusrntatjx5abwxgn54didxlre2pjfyn2zvpdmi3ef4jq.cubin", "triton_poi_fused__to_copy_convolution_0", 0, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_0 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_1 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_2 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_3 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_0, &var_1, &var_2, &global_scratch_3}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_convolution_0, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename out_ptr0_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_convolution_1( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_convolution_1', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 131072}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_convolution_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_convolution_1(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 73728 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (x0), None) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tmp0.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x0), tmp1, None) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (1024 - 1)) / (1024)); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_convolution_1 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_convolution_1 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/c7nh2ovduspc5ybbtrsjalmrhgkv3sixfefklqe2uqflq32po54w.cubin", "triton_poi_fused__to_copy_convolution_1", 0, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_4 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_5 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_6 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_7 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_4, &var_5, &var_6, &global_scratch_7}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_convolution_1, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename out_ptr0_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_convolution_2( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_convolution_2', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 512}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_convolution_2', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_convolution_2(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 512 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (x0), xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tmp0.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x0), tmp1, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (128 - 1)) / (128)); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_convolution_2 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_convolution_2 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/c3evy4rcaln7lqsl6xndoppq5vawyf6l7pleyr7qzmlan2s2lcbk.cubin", "triton_poi_fused__to_copy_convolution_2", 0, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_8 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_9 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_10 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_11 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_8, &var_9, &var_10, &global_scratch_11}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_convolution_2, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename out_ptr0_type_, typename out_ptr1_type_, typename out_ptr2_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_red_fused_native_group_norm_3( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr2_type_& out_ptr2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t r0_numel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_red_fused_native_group_norm_3', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.reduction( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 64, 'r0_': 8192}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reduction_hint=ReductionHint.INNER, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'out_ptr2': '*fp32', 'ks0': 'i32', 'ks1': 'i32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_native_group_norm_3', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 2, 'num_reduction': 3, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True} | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_red_fused_native_group_norm_3(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, ks0, ks1, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 64 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rnumel = r0_numel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rbase = r0_base | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = (xindex % 2) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // 2 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x3 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_index = r0_offset + r0_base | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_mask = r0_index < r0_numel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] roffset = r0_offset | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rindex = r0_index | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_2 = r0_index | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (ks0*ks1*((((r0_2 + 8*ks0*ks1*x0) // (ks0*ks1)) % 16)) + 16*ks0*ks1*x1 + ((((r0_2 % (ks0*ks1))) % (ks0*ks1)))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (16*x1 + ((((r0_2 + 8*ks0*ks1*x0) // (ks0*ks1)) % 16))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tmp0 + tmp1 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tmp2.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.broadcast_to(tmp3, [XBLOCK, R0_BLOCK]) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_mean_next, tmp5_m2_next, tmp5_weight_next = triton_helpers.welford_reduce( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4, tmp5_mean, tmp5_m2, tmp5_weight, roffset == 0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_mean = tl.where(r0_mask & xmask, tmp5_mean_next, tmp5_mean) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_m2 = tl.where(r0_mask & xmask, tmp5_m2_next, tmp5_m2) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_weight = tl.where(r0_mask & xmask, tmp5_weight_next, tmp5_weight) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8, tmp9, tmp10 = triton_helpers.welford(tmp5_mean, tmp5_m2, tmp5_weight, 1) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp8[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tmp9[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tmp10[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x3), tmp5, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x3), tmp6, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr2 + (x3), tmp7, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_red_fused_native_group_norm_3 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_red_fused_native_group_norm_3 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cw5dqtidbmmszjcofl4rzmbldngnokquurk3ckhqfx4yk3o6rttr.cubin", "triton_red_fused_native_group_norm_3", 192, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_12 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_13 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_14 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_15 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_16 = reinterpret_cast<CUdeviceptr>(out_ptr2.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_17 = ks0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_18 = ks1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_19 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_20 = r0_numel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_21 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_12, &var_13, &var_14, &var_15, &var_16, &var_17, &var_18, &var_19, &var_20, &global_scratch_21}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_red_fused_native_group_norm_3, grid_0, grid_1, grid_2, 16, 192, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename out_ptr0_type_, typename out_ptr1_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_per_fused_native_group_norm_4( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t r0_numel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_per_fused_native_group_norm_4', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.persistent_reduction( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 32, 'r0_': 2}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reduction_hint=ReductionHint.INNER, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_per_fused_native_group_norm_4', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 2, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True} | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_per_fused_native_group_norm_4(in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, xnumel, r0_numel, XBLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 32 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_numel = 2 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] R0_BLOCK: tl.constexpr = 2 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rnumel = r0_numel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_index = tl.arange(0, R0_BLOCK)[None, :] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_offset = 0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_mask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] roffset = r0_offset | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rindex = r0_index | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_1 = r0_index | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (r0_1 + 2*x0), xmask, other=0.0) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (r0_1 + 2*x0), xmask, other=0.0) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tl.load(in_ptr2 + (r0_1 + 2*x0), xmask, other=0.0) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tl.broadcast_to(tmp0, [XBLOCK, R0_BLOCK]) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.broadcast_to(tmp1, [XBLOCK, R0_BLOCK]) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tl.broadcast_to(tmp2, [XBLOCK, R0_BLOCK]) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tl.where(xmask, tmp3, 0) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tl.where(xmask, tmp4, 0) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = tl.where(xmask, tmp5, 0) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10, tmp11, tmp12 = triton_helpers.welford(tmp7, tmp8, tmp9, 1) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp13 = tmp10[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp14 = tmp11[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp15 = tmp12[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x0), tmp13, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x0), tmp14, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (8 - 1)) / (8)); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_per_fused_native_group_norm_4 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_per_fused_native_group_norm_4 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cyf5aory652x6536phtdflvp5a6acfvciiqgqz3ueruoebzgxtng.cubin", "triton_per_fused_native_group_norm_4", 0, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_22 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_23 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_24 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_25 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_26 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_27 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_28 = r0_numel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_29 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_22, &var_23, &var_24, &var_25, &var_26, &var_27, &var_28, &global_scratch_29}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_per_fused_native_group_norm_4, grid_0, grid_1, grid_2, 2, 0, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename in_ptr5_type_, typename out_ptr1_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_5( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr3_type_& in_ptr3, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr4_type_& in_ptr4, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr5_type_& in_ptr5, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_5', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 524288}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'in_ptr5': '*fp32', 'out_ptr1': '*bf16', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (10,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_5', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 6, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_5(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr1, ks0, ks1, ks2, xnumel, XBLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x2 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // ks0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (x2), xmask, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (x1), xmask, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.load(in_ptr2 + (x1 // 16), xmask, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tl.load(in_ptr3 + (x1 // 16), xmask, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp14 = tl.load(in_ptr4 + (x1), xmask, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp16 = tl.load(in_ptr5 + (x1), xmask, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tmp0 + tmp1 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tmp2.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp3 - tmp4 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = 16*ks1*ks2 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tmp7.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = (tmp6 / tmp8) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10 = 1e-06 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp11 = tmp9 + tmp10 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp12 = libdevice.rsqrt(tmp11) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp13 = tmp5 * tmp12 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp15 = tmp13 * tmp14 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp17 = tmp15 + tmp16 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp18 = tl.sigmoid(tmp17) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp19 = tmp17 * tmp18 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp20 = tmp19.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x2), tmp20, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (1024 - 1)) / (1024)); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_5 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_5 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/c6v4bvys5ewmltqf3ivtpd7jhmo6spoliy2aeunfsje7jgwnsmw3.cubin", "triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_5", 0, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_30 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_31 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_32 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_33 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_34 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_35 = reinterpret_cast<CUdeviceptr>(in_ptr5.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_36 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_37 = ks0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_38 = ks1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_39 = ks2; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_40 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_41 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_30, &var_31, &var_32, &var_33, &var_34, &var_35, &var_36, &var_37, &var_38, &var_39, &var_40, &global_scratch_41}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_5, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename out_ptr0_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_convolution_mul_sigmoid_6( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_convolution_mul_sigmoid_6', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 4194304}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_convolution_mul_sigmoid_6', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_convolution_mul_sigmoid_6(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 2359296 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (x0), None) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tmp0.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x0), tmp1, None) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (1024 - 1)) / (1024)); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_convolution_mul_sigmoid_6 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_convolution_mul_sigmoid_6 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cqon4fmhenlp4qvhco6q5yusdakrngw3zldxf5reydmahju5yr5g.cubin", "triton_poi_fused__to_copy_convolution_mul_sigmoid_6", 0, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_42 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_43 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_44 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_45 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_42, &var_43, &var_44, &global_scratch_45}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_convolution_mul_sigmoid_6, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename out_ptr0_type_, typename out_ptr1_type_, typename out_ptr2_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_red_fused_native_group_norm_7( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr2_type_& out_ptr2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t r0_numel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_red_fused_native_group_norm_7', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.reduction( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 64, 'r0_': 8192}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reduction_hint=ReductionHint.INNER, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'out_ptr2': '*fp32', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_native_group_norm_7', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 2, 'num_reduction': 3, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True} | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_red_fused_native_group_norm_7(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, ks0, ks1, ks2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 64 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rnumel = r0_numel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rbase = r0_base | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = (xindex % 2) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // 2 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x3 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_index = r0_offset + r0_base | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_mask = r0_index < r0_numel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] roffset = r0_offset | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rindex = r0_index | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_2 = r0_index | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (ks1*ks2*((((r0_2 + 8*ks1*ks2*x0) // ks0) % 16)) + 16*ks1*ks2*x1 + ((((r0_2 % ks0)) % ks0))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (16*x1 + ((((r0_2 + 8*ks1*ks2*x0) // ks0) % 16))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tmp0 + tmp1 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tmp2.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.broadcast_to(tmp3, [XBLOCK, R0_BLOCK]) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_mean_next, tmp5_m2_next, tmp5_weight_next = triton_helpers.welford_reduce( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4, tmp5_mean, tmp5_m2, tmp5_weight, roffset == 0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_mean = tl.where(r0_mask & xmask, tmp5_mean_next, tmp5_mean) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_m2 = tl.where(r0_mask & xmask, tmp5_m2_next, tmp5_m2) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_weight = tl.where(r0_mask & xmask, tmp5_weight_next, tmp5_weight) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8, tmp9, tmp10 = triton_helpers.welford(tmp5_mean, tmp5_m2, tmp5_weight, 1) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp8[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tmp9[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tmp10[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x3), tmp5, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x3), tmp6, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr2 + (x3), tmp7, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_red_fused_native_group_norm_7 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_red_fused_native_group_norm_7 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/ctgaufphdl4ydfbxralbmsuk6x5y5iapr4cfau4wnsvalvze5gzv.cubin", "triton_red_fused_native_group_norm_7", 192, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_46 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_47 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_48 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_49 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_50 = reinterpret_cast<CUdeviceptr>(out_ptr2.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_51 = ks0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_52 = ks1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_53 = ks2; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_54 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_55 = r0_numel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_56 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_46, &var_47, &var_48, &var_49, &var_50, &var_51, &var_52, &var_53, &var_54, &var_55, &global_scratch_56}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_red_fused_native_group_norm_7, grid_0, grid_1, grid_2, 16, 192, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename out_ptr0_type_, typename out_ptr1_type_, typename out_ptr2_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_red_fused_native_group_norm_8( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr3_type_& in_ptr3, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr2_type_& out_ptr2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t r0_numel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_red_fused_native_group_norm_8', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.reduction( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 64, 'r0_': 8192}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reduction_hint=ReductionHint.INNER, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'in_ptr3': '*bf16', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'out_ptr2': '*fp32', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (10,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_native_group_norm_8', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 4, 'num_reduction': 3, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True} | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_red_fused_native_group_norm_8(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, out_ptr1, out_ptr2, ks0, ks1, ks2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 64 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rnumel = r0_numel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rbase = r0_base | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = (xindex % 2) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // 2 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x3 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_index = r0_offset + r0_base | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_mask = r0_index < r0_numel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] roffset = r0_offset | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rindex = r0_index | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_2 = r0_index | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (ks1*ks2*((((r0_2 + 8*ks1*ks2*x0) // ks0) % 16)) + 16*ks1*ks2*x1 + ((((r0_2 % ks0)) % ks0))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (16*x1 + ((((r0_2 + 8*ks1*ks2*x0) // ks0) % 16))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tl.load(in_ptr2 + (ks1*ks2*((((r0_2 + 8*ks1*ks2*x0) // ks0) % 16)) + 16*ks1*ks2*x1 + ((((r0_2 % ks0)) % ks0))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.load(in_ptr3 + (16*x1 + ((((r0_2 + 8*ks1*ks2*x0) // ks0) % 16))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tmp0 + tmp1 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp3 + tmp4 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tmp2 + tmp5 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tmp6.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tl.broadcast_to(tmp7, [XBLOCK, R0_BLOCK]) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9_mean_next, tmp9_m2_next, tmp9_weight_next = triton_helpers.welford_reduce( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8, tmp9_mean, tmp9_m2, tmp9_weight, roffset == 0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9_mean = tl.where(r0_mask & xmask, tmp9_mean_next, tmp9_mean) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9_m2 = tl.where(r0_mask & xmask, tmp9_m2_next, tmp9_m2) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9_weight = tl.where(r0_mask & xmask, tmp9_weight_next, tmp9_weight) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp12, tmp13, tmp14 = triton_helpers.welford(tmp9_mean, tmp9_m2, tmp9_weight, 1) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = tmp12[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10 = tmp13[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp11 = tmp14[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x3), tmp9, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x3), tmp10, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr2 + (x3), tmp11, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_red_fused_native_group_norm_8 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_red_fused_native_group_norm_8 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/c7dgp7owleeb43ezkfyawptppgcsfmwynfgsbo2qqvot2g3ckueb.cubin", "triton_red_fused_native_group_norm_8", 192, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_57 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_58 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_59 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_60 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_61 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_62 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_63 = reinterpret_cast<CUdeviceptr>(out_ptr2.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_64 = ks0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_65 = ks1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_66 = ks2; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_67 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_68 = r0_numel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_69 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_57, &var_58, &var_59, &var_60, &var_61, &var_62, &var_63, &var_64, &var_65, &var_66, &var_67, &var_68, &global_scratch_69}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_red_fused_native_group_norm_8, grid_0, grid_1, grid_2, 16, 192, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename in_ptr5_type_, typename in_ptr6_type_, typename in_ptr7_type_, typename out_ptr1_type_, typename out_ptr2_type_, typename out_ptr3_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_convolution_native_group_norm_9( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr3_type_& in_ptr3, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr4_type_& in_ptr4, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr5_type_& in_ptr5, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr6_type_& in_ptr6, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr7_type_& in_ptr7, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr2_type_& out_ptr2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr3_type_& out_ptr3, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_convolution_native_group_norm_9', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 524288}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'in_ptr3': '*bf16', 'in_ptr4': '*fp32', 'in_ptr5': '*fp32', 'in_ptr6': '*fp32', 'in_ptr7': '*fp32', 'out_ptr1': '*bf16', 'out_ptr2': '*bf16', 'out_ptr3': '*bf16', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]], (9,): [['tt.divisibility', 16]], (10,): [['tt.divisibility', 16]], (14,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_convolution_native_group_norm_9', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 8, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_convolution_native_group_norm_9(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, out_ptr1, out_ptr2, out_ptr3, ks0, ks1, ks2, xnumel, XBLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x2 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // ks0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (x2), xmask, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (x1), xmask, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tl.load(in_ptr2 + (x2), xmask, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.load(in_ptr3 + (x1), xmask, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tl.load(in_ptr4 + (x1 // 16), xmask, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10 = tl.load(in_ptr5 + (x1 // 16), xmask, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp18 = tl.load(in_ptr6 + (x1), xmask, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp20 = tl.load(in_ptr7 + (x1), xmask, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tmp0 + tmp1 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp3 + tmp4 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tmp2 + tmp5 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tmp6.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = tmp7 - tmp8 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp11 = 16*ks1*ks2 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp12 = tmp11.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp13 = (tmp10 / tmp12) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp14 = 1e-06 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp15 = tmp13 + tmp14 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp16 = libdevice.rsqrt(tmp15) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp17 = tmp9 * tmp16 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp19 = tmp17 * tmp18 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp21 = tmp19 + tmp20 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp22 = tmp21.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x2), tmp22, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr2 + (x2), tmp22, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr3 + (x2), tmp22, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (512 - 1)) / (512)); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_convolution_native_group_norm_9 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_convolution_native_group_norm_9 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/coanhxisptitfdasawvc6iwnfch3gcc4eivcrjds6y26a7jgxjzb.cubin", "triton_poi_fused__to_copy_convolution_native_group_norm_9", 0, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_70 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_71 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_72 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_73 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_74 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_75 = reinterpret_cast<CUdeviceptr>(in_ptr5.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_76 = reinterpret_cast<CUdeviceptr>(in_ptr6.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_77 = reinterpret_cast<CUdeviceptr>(in_ptr7.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_78 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_79 = reinterpret_cast<CUdeviceptr>(out_ptr2.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_80 = reinterpret_cast<CUdeviceptr>(out_ptr3.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_81 = ks0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_82 = ks1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_83 = ks2; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_84 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_85 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_70, &var_71, &var_72, &var_73, &var_74, &var_75, &var_76, &var_77, &var_78, &var_79, &var_80, &var_81, &var_82, &var_83, &var_84, &global_scratch_85}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_convolution_native_group_norm_9, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename out_ptr0_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_convolution_10( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_convolution_10', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 262144}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_convolution_10', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_convolution_10(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 262144 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (x0), None) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tmp0.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x0), tmp1, None) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (1024 - 1)) / (1024)); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_convolution_10 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_convolution_10 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cyxtqg3wbaqtjvvsnpm76nco2oasofkesqzf2e6w6l2v3b34xu4t.cubin", "triton_poi_fused__to_copy_convolution_10", 0, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_86 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_87 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_88 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_89 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_86, &var_87, &var_88, &global_scratch_89}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_convolution_10, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename out_ptr0_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__scaled_dot_product_efficient_attention_clone_11( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ynumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__scaled_dot_product_efficient_attention_clone_11', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'y': 1024, 'x': 512}, tile_hint=TileHint.DEFAULT, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'out_ptr0': '*bf16', 'ks0': 'i32', 'ks1': 'i32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid2DWithYZOverflow', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__scaled_dot_product_efficient_attention_clone_11', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 2, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__scaled_dot_product_efficient_attention_clone_11(in_ptr0, in_ptr1, out_ptr0, ks0, ks1, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 512 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] yoffset = (tl.program_id(1) + tl.program_id(2) * tl.num_programs(1)) * YBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] yindex = yoffset + tl.arange(0, YBLOCK)[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ymask = yindex < ynumel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[None, :] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] y0 = yindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (y0 + ks0*ks1*x1), ymask & xmask, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (x1), xmask, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tmp0 + tmp1 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x1 + 512*y0), tmp2, ymask & xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t y_grid_raw_ = ((ynumel + (32 - 1)) / (32)); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t y_grid_div_ = ((y_grid_raw_ + (65535 - 1)) / (65535)); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (32 - 1)) / (32)); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = ((y_grid_raw_ + (y_grid_div_ - 1)) / (y_grid_div_)); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = y_grid_div_; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__scaled_dot_product_efficient_attention_clone_11 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__scaled_dot_product_efficient_attention_clone_11 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cnb6p6jfugm6m5kzhvoewfhdoxxnhuuq7776ulkafif652n3zfcz.cubin", "triton_poi_fused__scaled_dot_product_efficient_attention_clone_11", 2560, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_90 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_91 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_92 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_93 = ks0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_94 = ks1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_95 = ynumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_96 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_97 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_90, &var_91, &var_92, &var_93, &var_94, &var_95, &var_96, &global_scratch_97}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__scaled_dot_product_efficient_attention_clone_11, grid_0, grid_1, grid_2, 4, 2560, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_add_convolution_mul_sigmoid_12( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_out_ptr0_type_& in_out_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr3_type_& in_ptr3, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr4_type_& in_ptr4, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ynumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_add_convolution_mul_sigmoid_12', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'y': 512, 'x': 1024}, tile_hint=TileHint.DEFAULT, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'in_ptr3': '*bf16', 'in_ptr4': '*bf16', 'ks0': 'i32', 'ks1': 'i32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_add_convolution_mul_sigmoid_12', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 6, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_add_convolution_mul_sigmoid_12(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, ks0, ks1, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ynumel = 512 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] yoffset = tl.program_id(1) * YBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] yindex = yoffset + tl.arange(0, YBLOCK)[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ymask = yindex < ynumel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[None, :] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] y0 = yindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_out_ptr0 + (x1 + ks0*ks1*y0), ymask & xmask, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr0 + (y0), ymask, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tl.load(in_ptr1 + (x1 + ks0*ks1*y0), ymask & xmask, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.load(in_ptr2 + (y0), ymask, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tl.load(in_ptr3 + (y0 + 512*x1), ymask & xmask, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tl.load(in_ptr4 + (y0), ymask, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tmp0 + tmp1 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp3 + tmp4 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tmp2 + tmp5 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = tmp7 + tmp8 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10 = tmp6 + tmp9 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.debug_barrier() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(in_out_ptr0 + (x1 + ks0*ks1*y0), tmp10, ymask & xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (32 - 1)) / (32)); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = ((ynumel + (32 - 1)) / (32)); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_add_convolution_mul_sigmoid_12 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_add_convolution_mul_sigmoid_12 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cnyojbbwpuuzbl5nng3ndkyl6wdh2u2xssgt5rq6mhiboqnmpmmt.cubin", "triton_poi_fused__to_copy_add_convolution_mul_sigmoid_12", 2112, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_98 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_99 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_100 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_101 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_102 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_103 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_104 = ks0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_105 = ks1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_106 = ynumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_107 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_108 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_98, &var_99, &var_100, &var_101, &var_102, &var_103, &var_104, &var_105, &var_106, &var_107, &global_scratch_108}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_add_convolution_mul_sigmoid_12, grid_0, grid_1, grid_2, 4, 2112, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename out_ptr0_type_, typename out_ptr1_type_, typename out_ptr2_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_red_fused_native_group_norm_13( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr2_type_& out_ptr2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t r0_numel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_red_fused_native_group_norm_13', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.reduction( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 64, 'r0_': 8192}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reduction_hint=ReductionHint.INNER, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'out_ptr2': '*fp32', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_native_group_norm_13', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 3, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True} | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_red_fused_native_group_norm_13(in_ptr0, out_ptr0, out_ptr1, out_ptr2, ks0, ks1, ks2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 64 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rnumel = r0_numel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rbase = r0_base | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = (xindex % 2) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // 2 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x3 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_index = r0_offset + r0_base | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_mask = r0_index < r0_numel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] roffset = r0_offset | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rindex = r0_index | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_2 = r0_index | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (ks1*ks2*((((r0_2 + 8*ks1*ks2*x0) // ks0) % 16)) + 16*ks1*ks2*x1 + ((((r0_2 % ks0)) % ks0))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tmp0.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tl.broadcast_to(tmp1, [XBLOCK, R0_BLOCK]) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_mean_next, tmp3_m2_next, tmp3_weight_next = triton_helpers.welford_reduce( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2, tmp3_mean, tmp3_m2, tmp3_weight, roffset == 0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_mean = tl.where(r0_mask & xmask, tmp3_mean_next, tmp3_mean) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_m2 = tl.where(r0_mask & xmask, tmp3_m2_next, tmp3_m2) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_weight = tl.where(r0_mask & xmask, tmp3_weight_next, tmp3_weight) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6, tmp7, tmp8 = triton_helpers.welford(tmp3_mean, tmp3_m2, tmp3_weight, 1) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tmp6[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tmp7[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp8[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x3), tmp3, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x3), tmp4, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr2 + (x3), tmp5, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_red_fused_native_group_norm_13 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_red_fused_native_group_norm_13 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/c3wyogxxlvfs3mp5obxu24c73ptc3pl5fr5npe5sqhfjl2vuh3fy.cubin", "triton_red_fused_native_group_norm_13", 192, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_109 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_110 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_111 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_112 = reinterpret_cast<CUdeviceptr>(out_ptr2.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_113 = ks0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_114 = ks1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_115 = ks2; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_116 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_117 = r0_numel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_118 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_109, &var_110, &var_111, &var_112, &var_113, &var_114, &var_115, &var_116, &var_117, &global_scratch_118}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_red_fused_native_group_norm_13, grid_0, grid_1, grid_2, 16, 192, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename out_ptr1_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_14( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr3_type_& in_ptr3, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr4_type_& in_ptr4, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_14', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 524288}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'out_ptr1': '*bf16', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (9,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_14', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_14(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr1, ks0, ks1, ks2, xnumel, XBLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x2 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // ks0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (x2), xmask, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tl.load(in_ptr1 + (x1 // 16), xmask, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.load(in_ptr2 + (x1 // 16), xmask, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp12 = tl.load(in_ptr3 + (x1), xmask, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp14 = tl.load(in_ptr4 + (x1), xmask, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tmp0.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tmp1 - tmp2 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = 16*ks1*ks2 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tmp5.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = (tmp4 / tmp6) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = 1e-06 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = tmp7 + tmp8 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10 = libdevice.rsqrt(tmp9) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp11 = tmp3 * tmp10 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp13 = tmp11 * tmp12 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp15 = tmp13 + tmp14 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp16 = tl.sigmoid(tmp15) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp17 = tmp15 * tmp16 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp18 = tmp17.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x2), tmp18, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (512 - 1)) / (512)); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_14 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_14 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/ce66l2cfwtkucstdimfknnfvgcopcdq2vgdfqfu4f5yh4buok6mb.cubin", "triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_14", 0, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_119 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_120 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_121 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_122 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_123 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_124 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_125 = ks0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_126 = ks1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_127 = ks2; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_128 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_129 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_119, &var_120, &var_121, &var_122, &var_123, &var_124, &var_125, &var_126, &var_127, &var_128, &global_scratch_129}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_14, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename out_ptr0_type_, typename out_ptr1_type_, typename out_ptr2_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_red_fused_native_group_norm_15( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr2_type_& out_ptr2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t r0_numel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_red_fused_native_group_norm_15', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.reduction( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 64, 'r0_': 8192}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reduction_hint=ReductionHint.INNER, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'out_ptr2': '*fp32', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (9,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_native_group_norm_15', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 3, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True} | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_red_fused_native_group_norm_15(in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, ks0, ks1, ks2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 64 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rnumel = r0_numel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rbase = r0_base | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = (xindex % 2) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // 2 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x3 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_index = r0_offset + r0_base | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_mask = r0_index < r0_numel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] roffset = r0_offset | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rindex = r0_index | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_2 = r0_index | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (ks1*ks2*((((r0_2 + 8*ks1*ks2*x0) // ks0) % 16)) + 16*ks1*ks2*x1 + ((((r0_2 % ks0)) % ks0))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (ks1*ks2*((((r0_2 + 8*ks1*ks2*x0) // ks0) % 16)) + 16*ks1*ks2*x1 + ((((r0_2 % ks0)) % ks0))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tl.load(in_ptr2 + (16*x1 + ((((r0_2 + 8*ks1*ks2*x0) // ks0) % 16))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tmp1 + tmp2 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tmp0 + tmp3 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp4.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tl.broadcast_to(tmp5, [XBLOCK, R0_BLOCK]) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7_mean_next, tmp7_m2_next, tmp7_weight_next = triton_helpers.welford_reduce( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6, tmp7_mean, tmp7_m2, tmp7_weight, roffset == 0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7_mean = tl.where(r0_mask & xmask, tmp7_mean_next, tmp7_mean) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7_m2 = tl.where(r0_mask & xmask, tmp7_m2_next, tmp7_m2) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7_weight = tl.where(r0_mask & xmask, tmp7_weight_next, tmp7_weight) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10, tmp11, tmp12 = triton_helpers.welford(tmp7_mean, tmp7_m2, tmp7_weight, 1) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tmp10[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tmp11[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = tmp12[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x3), tmp7, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x3), tmp8, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr2 + (x3), tmp9, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_red_fused_native_group_norm_15 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_red_fused_native_group_norm_15 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cnqjga7ngpvi4wizwxmrd26bprns73moojxkckff3lwjafghffbb.cubin", "triton_red_fused_native_group_norm_15", 192, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_130 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_131 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_132 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_133 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_134 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_135 = reinterpret_cast<CUdeviceptr>(out_ptr2.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_136 = ks0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_137 = ks1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_138 = ks2; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_139 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_140 = r0_numel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_141 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_130, &var_131, &var_132, &var_133, &var_134, &var_135, &var_136, &var_137, &var_138, &var_139, &var_140, &global_scratch_141}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_red_fused_native_group_norm_15, grid_0, grid_1, grid_2, 16, 192, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename in_ptr5_type_, typename in_ptr6_type_, typename out_ptr1_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_16( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr3_type_& in_ptr3, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr4_type_& in_ptr4, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr5_type_& in_ptr5, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr6_type_& in_ptr6, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_16', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 524288}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'in_ptr5': '*fp32', 'in_ptr6': '*fp32', 'out_ptr1': '*bf16', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (11,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_16', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 7, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_16(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr1, ks0, ks1, ks2, xnumel, XBLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x2 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // ks0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (x2), xmask, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (x2), xmask, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tl.load(in_ptr2 + (x1), xmask, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tl.load(in_ptr3 + (x1 // 16), xmask, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tl.load(in_ptr4 + (x1 // 16), xmask, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp16 = tl.load(in_ptr5 + (x1), xmask, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp18 = tl.load(in_ptr6 + (x1), xmask, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tmp1 + tmp2 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tmp0 + tmp3 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp4.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tmp5 - tmp6 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = 16*ks1*ks2 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10 = tmp9.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp11 = (tmp8 / tmp10) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp12 = 1e-06 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp13 = tmp11 + tmp12 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp14 = libdevice.rsqrt(tmp13) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp15 = tmp7 * tmp14 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp17 = tmp15 * tmp16 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp19 = tmp17 + tmp18 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp20 = tl.sigmoid(tmp19) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp21 = tmp19 * tmp20 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp22 = tmp21.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x2), tmp22, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (512 - 1)) / (512)); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_16 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_16 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cerzvyddgk6xjjm6rnz7xwp3oibvn5tgw6pbshclafd2olobrl6i.cubin", "triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_16", 0, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_142 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_143 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_144 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_145 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_146 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_147 = reinterpret_cast<CUdeviceptr>(in_ptr5.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_148 = reinterpret_cast<CUdeviceptr>(in_ptr6.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_149 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_150 = ks0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_151 = ks1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_152 = ks2; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_153 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_154 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_142, &var_143, &var_144, &var_145, &var_146, &var_147, &var_148, &var_149, &var_150, &var_151, &var_152, &var_153, &global_scratch_154}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_16, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename out_ptr0_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_add_convolution_mul_sigmoid_17( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr3_type_& in_ptr3, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr4_type_& in_ptr4, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_add_convolution_mul_sigmoid_17', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 524288}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'in_ptr3': '*bf16', 'in_ptr4': '*bf16', 'out_ptr0': '*fp32', 'ks0': 'i32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_add_convolution_mul_sigmoid_17', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_add_convolution_mul_sigmoid_17(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, ks0, xnumel, XBLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x2 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // ks0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (x2), xmask, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (x2), xmask, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tl.load(in_ptr2 + (x1), xmask, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tl.load(in_ptr3 + (x2), xmask, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tl.load(in_ptr4 + (x1), xmask, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tmp1 + tmp2 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tmp0 + tmp3 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp4.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tmp6 + tmp7 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = tmp8.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10 = tmp5 + tmp9 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x2), tmp10, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (1024 - 1)) / (1024)); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_add_convolution_mul_sigmoid_17 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_add_convolution_mul_sigmoid_17 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cc5fomvsznhjfks2obdqhntif5ulal7zx3dlnqxgptc55tazlzp6.cubin", "triton_poi_fused__to_copy_add_convolution_mul_sigmoid_17", 4096, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_155 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_156 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_157 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_158 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_159 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_160 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_161 = ks0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_162 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_163 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_155, &var_156, &var_157, &var_158, &var_159, &var_160, &var_161, &var_162, &global_scratch_163}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_add_convolution_mul_sigmoid_17, grid_0, grid_1, grid_2, 4, 4096, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename out_ptr0_type_, typename out_ptr1_type_, typename out_ptr2_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_red_fused_native_group_norm_18( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr2_type_& out_ptr2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t r0_numel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_red_fused_native_group_norm_18', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.reduction( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 64, 'r0_': 8192}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reduction_hint=ReductionHint.INNER, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'out_ptr2': '*fp32', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_native_group_norm_18', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 3, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True} | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_red_fused_native_group_norm_18(in_ptr0, out_ptr0, out_ptr1, out_ptr2, ks0, ks1, ks2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 64 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rnumel = r0_numel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rbase = r0_base | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = (xindex % 2) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // 2 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x3 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_index = r0_offset + r0_base | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_mask = r0_index < r0_numel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] roffset = r0_offset | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rindex = r0_index | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_2 = r0_index | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (ks1*ks2*((((r0_2 + 8*ks1*ks2*x0) // ks0) % 16)) + 16*ks1*ks2*x1 + ((r0_2 % ks0))), xmask & r0_mask, eviction_policy='evict_last', other=0.0) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.broadcast_to(tmp0, [XBLOCK, R0_BLOCK]) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2_mean_next, tmp2_m2_next, tmp2_weight_next = triton_helpers.welford_reduce( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1, tmp2_mean, tmp2_m2, tmp2_weight, roffset == 0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2_mean = tl.where(r0_mask & xmask, tmp2_mean_next, tmp2_mean) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2_m2 = tl.where(r0_mask & xmask, tmp2_m2_next, tmp2_m2) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2_weight = tl.where(r0_mask & xmask, tmp2_weight_next, tmp2_weight) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5, tmp6, tmp7 = triton_helpers.welford(tmp2_mean, tmp2_m2, tmp2_weight, 1) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tmp5[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tmp6[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tmp7[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x3), tmp2, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x3), tmp3, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr2 + (x3), tmp4, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_red_fused_native_group_norm_18 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_red_fused_native_group_norm_18 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/c672ewfhywvvjvvqmyycaeh4n67pdgbznw5zobkheaov2niu4irf.cubin", "triton_red_fused_native_group_norm_18", 192, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_164 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_165 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_166 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_167 = reinterpret_cast<CUdeviceptr>(out_ptr2.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_168 = ks0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_169 = ks1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_170 = ks2; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_171 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_172 = r0_numel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_173 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_164, &var_165, &var_166, &var_167, &var_168, &var_169, &var_170, &var_171, &var_172, &global_scratch_173}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_red_fused_native_group_norm_18, grid_0, grid_1, grid_2, 16, 192, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename out_ptr1_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_19( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr3_type_& in_ptr3, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr4_type_& in_ptr4, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_19', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 524288}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'out_ptr1': '*bf16', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (9,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_19', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_19(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr1, ks0, ks1, ks2, xnumel, XBLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x2 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // ks0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (x2), xmask, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (x1 // 16), xmask, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tl.load(in_ptr2 + (x1 // 16), xmask, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp11 = tl.load(in_ptr3 + (x1), xmask, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp13 = tl.load(in_ptr4 + (x1), xmask, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tmp0 - tmp1 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = 16*ks1*ks2 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp4.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = (tmp3 / tmp5) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = 1e-06 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tmp6 + tmp7 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = libdevice.rsqrt(tmp8) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10 = tmp2 * tmp9 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp12 = tmp10 * tmp11 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp14 = tmp12 + tmp13 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp15 = tl.sigmoid(tmp14) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp16 = tmp14 * tmp15 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp17 = tmp16.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x2), tmp17, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (512 - 1)) / (512)); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_19 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_19 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cshbdjfifm35yvmn4qp3vxv2imemcdoeytxtqrtfwba6wavhq6ze.cubin", "triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_19", 0, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_174 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_175 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_176 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_177 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_178 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_179 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_180 = ks0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_181 = ks1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_182 = ks2; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_183 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_184 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_174, &var_175, &var_176, &var_177, &var_178, &var_179, &var_180, &var_181, &var_182, &var_183, &global_scratch_184}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_19, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename out_ptr0_type_, typename out_ptr1_type_, typename out_ptr2_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_red_fused_native_group_norm_20( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr2_type_& out_ptr2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t r0_numel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_red_fused_native_group_norm_20', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.reduction( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 64, 'r0_': 8192}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reduction_hint=ReductionHint.INNER, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'out_ptr2': '*fp32', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (9,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_native_group_norm_20', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 3, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True} | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_red_fused_native_group_norm_20(in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, ks0, ks1, ks2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 64 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rnumel = r0_numel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rbase = r0_base | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = (xindex % 2) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // 2 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x3 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_index = r0_offset + r0_base | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_mask = r0_index < r0_numel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] roffset = r0_offset | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rindex = r0_index | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_2 = r0_index | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (ks1*ks2*((((r0_2 + 8*ks1*ks2*x0) // ks0) % 16)) + 16*ks1*ks2*x1 + ((((r0_2 % ks0)) % ks0))), xmask & r0_mask, eviction_policy='evict_last', other=0.0) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (ks1*ks2*((((r0_2 + 8*ks1*ks2*x0) // ks0) % 16)) + 16*ks1*ks2*x1 + ((((r0_2 % ks0)) % ks0))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tl.load(in_ptr2 + (16*x1 + ((((r0_2 + 8*ks1*ks2*x0) // ks0) % 16))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tmp1 + tmp2 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tmp3.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp0 + tmp4 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tl.broadcast_to(tmp5, [XBLOCK, R0_BLOCK]) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7_mean_next, tmp7_m2_next, tmp7_weight_next = triton_helpers.welford_reduce( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6, tmp7_mean, tmp7_m2, tmp7_weight, roffset == 0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7_mean = tl.where(r0_mask & xmask, tmp7_mean_next, tmp7_mean) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7_m2 = tl.where(r0_mask & xmask, tmp7_m2_next, tmp7_m2) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7_weight = tl.where(r0_mask & xmask, tmp7_weight_next, tmp7_weight) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10, tmp11, tmp12 = triton_helpers.welford(tmp7_mean, tmp7_m2, tmp7_weight, 1) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tmp10[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tmp11[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = tmp12[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x3), tmp7, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x3), tmp8, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr2 + (x3), tmp9, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_red_fused_native_group_norm_20 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_red_fused_native_group_norm_20 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/crfju4ylwfirzczakk4i75xxohp6oblrrdokawkmezmkq6dldhnn.cubin", "triton_red_fused_native_group_norm_20", 192, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_185 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_186 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_187 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_188 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_189 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_190 = reinterpret_cast<CUdeviceptr>(out_ptr2.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_191 = ks0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_192 = ks1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_193 = ks2; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_194 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_195 = r0_numel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_196 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_185, &var_186, &var_187, &var_188, &var_189, &var_190, &var_191, &var_192, &var_193, &var_194, &var_195, &global_scratch_196}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_red_fused_native_group_norm_20, grid_0, grid_1, grid_2, 16, 192, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename in_ptr5_type_, typename in_ptr6_type_, typename out_ptr1_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_21( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr3_type_& in_ptr3, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr4_type_& in_ptr4, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr5_type_& in_ptr5, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr6_type_& in_ptr6, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_21', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 524288}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'in_ptr5': '*fp32', 'in_ptr6': '*fp32', 'out_ptr1': '*bf16', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (11,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_21', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 7, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_21(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr1, ks0, ks1, ks2, xnumel, XBLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x2 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // ks0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (x2), xmask, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (x2), xmask, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tl.load(in_ptr2 + (x1), xmask, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tl.load(in_ptr3 + (x1 // 16), xmask, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tl.load(in_ptr4 + (x1 // 16), xmask, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp16 = tl.load(in_ptr5 + (x1), xmask, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp18 = tl.load(in_ptr6 + (x1), xmask, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tmp1 + tmp2 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tmp3.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp0 + tmp4 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tmp5 - tmp6 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = 16*ks1*ks2 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10 = tmp9.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp11 = (tmp8 / tmp10) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp12 = 1e-06 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp13 = tmp11 + tmp12 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp14 = libdevice.rsqrt(tmp13) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp15 = tmp7 * tmp14 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp17 = tmp15 * tmp16 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp19 = tmp17 + tmp18 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp20 = tl.sigmoid(tmp19) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp21 = tmp19 * tmp20 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp22 = tmp21.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x2), tmp22, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (512 - 1)) / (512)); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_21 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_21 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cxqg5i22geotiyh56e6jwtjx7fxpr3dgsemhzymjf63gxuiwjyjz.cubin", "triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_21", 0, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_197 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_198 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_199 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_200 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_201 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_202 = reinterpret_cast<CUdeviceptr>(in_ptr5.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_203 = reinterpret_cast<CUdeviceptr>(in_ptr6.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_204 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_205 = ks0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_206 = ks1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_207 = ks2; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_208 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_209 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_197, &var_198, &var_199, &var_200, &var_201, &var_202, &var_203, &var_204, &var_205, &var_206, &var_207, &var_208, &global_scratch_209}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_21, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename out_ptr0_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_22( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr3_type_& in_ptr3, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr4_type_& in_ptr4, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks3, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks4, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_22', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 2097152}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'in_ptr3': '*bf16', 'in_ptr4': '*bf16', 'out_ptr0': '*bf16', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'ks3': 'i32', 'ks4': 'i32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (11,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_22', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 2, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_22(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, ks0, ks1, ks2, ks3, ks4, xnumel, XBLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = ((xindex // ks1) % ks2) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = (xindex % ks1) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x2 = xindex // ks4 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x3 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp22 = tl.load(in_ptr2 + (x2), xmask, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp27 = tl.load(in_ptr4 + (x2), xmask, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = ((ks0.to(tl.float64)) / (tl.full([], 2.00000000000000, tl.float64)*ks0.to(tl.float64))) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tmp0.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = x1 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tmp2.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tmp3 * tmp1 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp4.to(tl.int64) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = ks0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tmp5 + tmp6 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tmp5 < 0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = tl.where(tmp8, tmp7, tmp5) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10 = ((ks3.to(tl.float64)) / (tl.full([], 2.00000000000000, tl.float64)*ks3.to(tl.float64))) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp11 = tmp10.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp12 = x0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp13 = tmp12.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp14 = tmp13 * tmp11 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp15 = tmp14.to(tl.int64) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp16 = ks3 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp17 = tmp15 + tmp16 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp18 = tmp15 < 0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp19 = tl.where(tmp18, tmp17, tmp15) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp20 = tl.load(in_ptr0 + (tmp19 + ks3*tmp9 + ks0*ks3*x2), xmask, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp21 = tl.load(in_ptr1 + (tmp19 + ks3*tmp9 + ks0*ks3*x2), xmask, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp23 = tmp21 + tmp22 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp24 = tmp23.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp25 = tmp20 + tmp24 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp26 = tl.load(in_ptr3 + (tmp19 + ks3*tmp9 + ks0*ks3*x2), xmask, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp28 = tmp26 + tmp27 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp29 = tmp28.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp30 = tmp25 + tmp29 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp31 = tmp30.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x3), tmp31, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (1024 - 1)) / (1024)); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_22 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_22 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/c74i6lod2nm4yqm3gxjie5sdqhab5gsy4nw7acrdg2qn7c7toe4o.cubin", "triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_22", 0, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_210 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_211 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_212 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_213 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_214 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_215 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_216 = ks0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_217 = ks1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_218 = ks2; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_219 = ks3; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_220 = ks4; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_221 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_222 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_210, &var_211, &var_212, &var_213, &var_214, &var_215, &var_216, &var_217, &var_218, &var_219, &var_220, &var_221, &global_scratch_222}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_22, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename out_ptr0_type_, typename out_ptr1_type_, typename out_ptr2_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_red_fused_native_group_norm_23( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr2_type_& out_ptr2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t r0_numel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_red_fused_native_group_norm_23', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.reduction( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 256, 'r0_': 8192}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reduction_hint=ReductionHint.INNER, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'out_ptr2': '*fp32', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_native_group_norm_23', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 2, 'num_reduction': 3, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True} | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_red_fused_native_group_norm_23(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, ks0, ks1, ks2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 256 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rnumel = r0_numel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rbase = r0_base | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = (xindex % 8) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // 8 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x3 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_index = r0_offset + r0_base | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_mask = r0_index < r0_numel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] roffset = r0_offset | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rindex = r0_index | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_2 = r0_index | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (4*ks1*ks2*((((r0_2 + 8*ks1*ks2*x0) // ks0) % 16)) + 64*ks1*ks2*x1 + ((((r0_2 % ks0)) % ks0))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (16*x1 + ((((r0_2 + 8*ks1*ks2*x0) // ks0) % 16))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tmp0 + tmp1 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tmp2.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.broadcast_to(tmp3, [XBLOCK, R0_BLOCK]) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_mean_next, tmp5_m2_next, tmp5_weight_next = triton_helpers.welford_reduce( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4, tmp5_mean, tmp5_m2, tmp5_weight, roffset == 0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_mean = tl.where(r0_mask & xmask, tmp5_mean_next, tmp5_mean) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_m2 = tl.where(r0_mask & xmask, tmp5_m2_next, tmp5_m2) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_weight = tl.where(r0_mask & xmask, tmp5_weight_next, tmp5_weight) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8, tmp9, tmp10 = triton_helpers.welford(tmp5_mean, tmp5_m2, tmp5_weight, 1) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp8[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tmp9[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tmp10[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x3), tmp5, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x3), tmp6, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr2 + (x3), tmp7, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_red_fused_native_group_norm_23 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_red_fused_native_group_norm_23 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/ctwlo73csmgu7xibbxfmsczvr3pg3v7iekhys6dl4ijgd6z3rkgw.cubin", "triton_red_fused_native_group_norm_23", 192, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_223 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_224 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_225 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_226 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_227 = reinterpret_cast<CUdeviceptr>(out_ptr2.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_228 = ks0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_229 = ks1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_230 = ks2; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_231 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_232 = r0_numel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_233 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_223, &var_224, &var_225, &var_226, &var_227, &var_228, &var_229, &var_230, &var_231, &var_232, &global_scratch_233}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_red_fused_native_group_norm_23, grid_0, grid_1, grid_2, 16, 192, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename out_ptr0_type_, typename out_ptr1_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_per_fused_native_group_norm_24( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t r0_numel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_per_fused_native_group_norm_24', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.persistent_reduction( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 32, 'r0_': 8}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reduction_hint=ReductionHint.INNER, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_per_fused_native_group_norm_24', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 2, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True} | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_per_fused_native_group_norm_24(in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, xnumel, r0_numel, XBLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 32 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_numel = 8 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] R0_BLOCK: tl.constexpr = 8 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rnumel = r0_numel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_index = tl.arange(0, R0_BLOCK)[None, :] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_offset = 0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_mask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] roffset = r0_offset | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rindex = r0_index | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_1 = r0_index | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (r0_1 + 8*x0), xmask, other=0.0) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (r0_1 + 8*x0), xmask, other=0.0) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tl.load(in_ptr2 + (r0_1 + 8*x0), xmask, other=0.0) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tl.broadcast_to(tmp0, [XBLOCK, R0_BLOCK]) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.broadcast_to(tmp1, [XBLOCK, R0_BLOCK]) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tl.broadcast_to(tmp2, [XBLOCK, R0_BLOCK]) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tl.where(xmask, tmp3, 0) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tl.where(xmask, tmp4, 0) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = tl.where(xmask, tmp5, 0) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10, tmp11, tmp12 = triton_helpers.welford(tmp7, tmp8, tmp9, 1) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp13 = tmp10[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp14 = tmp11[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp15 = tmp12[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x0), tmp13, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x0), tmp14, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (8 - 1)) / (8)); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_per_fused_native_group_norm_24 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_per_fused_native_group_norm_24 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cabjefmhsq4nvbq7xbsodpebbztvkrtwuhhrskefvh3uinxa2awr.cubin", "triton_per_fused_native_group_norm_24", 32, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_234 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_235 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_236 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_237 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_238 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_239 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_240 = r0_numel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_241 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_234, &var_235, &var_236, &var_237, &var_238, &var_239, &var_240, &global_scratch_241}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_per_fused_native_group_norm_24, grid_0, grid_1, grid_2, 2, 32, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename in_ptr5_type_, typename out_ptr1_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_25( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr3_type_& in_ptr3, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr4_type_& in_ptr4, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr5_type_& in_ptr5, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_25', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 2097152}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'in_ptr5': '*fp32', 'out_ptr1': '*bf16', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (10,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_25', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 6, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_25(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr1, ks0, ks1, ks2, xnumel, XBLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x2 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // ks0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (x2), xmask, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (x1), xmask, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.load(in_ptr2 + (x1 // 16), xmask, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tl.load(in_ptr3 + (x1 // 16), xmask, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp14 = tl.load(in_ptr4 + (x1), xmask, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp16 = tl.load(in_ptr5 + (x1), xmask, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tmp0 + tmp1 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tmp2.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp3 - tmp4 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = 64*ks1*ks2 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tmp7.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = (tmp6 / tmp8) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10 = 1e-06 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp11 = tmp9 + tmp10 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp12 = libdevice.rsqrt(tmp11) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp13 = tmp5 * tmp12 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp15 = tmp13 * tmp14 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp17 = tmp15 + tmp16 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp18 = tl.sigmoid(tmp17) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp19 = tmp17 * tmp18 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp20 = tmp19.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x2), tmp20, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (1024 - 1)) / (1024)); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_25 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_25 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/ces3hf5d632ipoxxctusejp5vs7esgaw5i3i6rq3r5mb3zvgooya.cubin", "triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_25", 0, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_242 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_243 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_244 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_245 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_246 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_247 = reinterpret_cast<CUdeviceptr>(in_ptr5.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_248 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_249 = ks0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_250 = ks1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_251 = ks2; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_252 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_253 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_242, &var_243, &var_244, &var_245, &var_246, &var_247, &var_248, &var_249, &var_250, &var_251, &var_252, &global_scratch_253}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_25, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename out_ptr0_type_, typename out_ptr1_type_, typename out_ptr2_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_red_fused_native_group_norm_26( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr3_type_& in_ptr3, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr2_type_& out_ptr2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t r0_numel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_red_fused_native_group_norm_26', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.reduction( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 256, 'r0_': 8192}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reduction_hint=ReductionHint.INNER, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'in_ptr3': '*bf16', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'out_ptr2': '*fp32', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (10,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_native_group_norm_26', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 4, 'num_reduction': 3, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True} | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_red_fused_native_group_norm_26(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, out_ptr1, out_ptr2, ks0, ks1, ks2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 256 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rnumel = r0_numel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rbase = r0_base | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = (xindex % 8) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // 8 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x3 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_index = r0_offset + r0_base | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_mask = r0_index < r0_numel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] roffset = r0_offset | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rindex = r0_index | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_2 = r0_index | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (4*ks1*ks2*((((r0_2 + 8*ks1*ks2*x0) // ks0) % 16)) + 64*ks1*ks2*x1 + ((((r0_2 % ks0)) % ks0))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (16*x1 + ((((r0_2 + 8*ks1*ks2*x0) // ks0) % 16))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tl.load(in_ptr2 + (4*ks1*ks2*((((r0_2 + 8*ks1*ks2*x0) // ks0) % 16)) + 64*ks1*ks2*x1 + ((((r0_2 % ks0)) % ks0))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.load(in_ptr3 + (16*x1 + ((((r0_2 + 8*ks1*ks2*x0) // ks0) % 16))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tmp0 + tmp1 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp3 + tmp4 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tmp2 + tmp5 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tmp6.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tl.broadcast_to(tmp7, [XBLOCK, R0_BLOCK]) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9_mean_next, tmp9_m2_next, tmp9_weight_next = triton_helpers.welford_reduce( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8, tmp9_mean, tmp9_m2, tmp9_weight, roffset == 0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9_mean = tl.where(r0_mask & xmask, tmp9_mean_next, tmp9_mean) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9_m2 = tl.where(r0_mask & xmask, tmp9_m2_next, tmp9_m2) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9_weight = tl.where(r0_mask & xmask, tmp9_weight_next, tmp9_weight) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp12, tmp13, tmp14 = triton_helpers.welford(tmp9_mean, tmp9_m2, tmp9_weight, 1) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = tmp12[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10 = tmp13[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp11 = tmp14[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x3), tmp9, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x3), tmp10, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr2 + (x3), tmp11, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_red_fused_native_group_norm_26 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_red_fused_native_group_norm_26 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/csusruwdq3ewpec2mijb54kievycqaj2fywnsptmdxjebzj7wfg5.cubin", "triton_red_fused_native_group_norm_26", 192, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_254 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_255 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_256 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_257 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_258 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_259 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_260 = reinterpret_cast<CUdeviceptr>(out_ptr2.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_261 = ks0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_262 = ks1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_263 = ks2; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_264 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_265 = r0_numel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_266 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_254, &var_255, &var_256, &var_257, &var_258, &var_259, &var_260, &var_261, &var_262, &var_263, &var_264, &var_265, &global_scratch_266}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_red_fused_native_group_norm_26, grid_0, grid_1, grid_2, 16, 192, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename in_ptr5_type_, typename in_ptr6_type_, typename in_ptr7_type_, typename out_ptr1_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_27( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr3_type_& in_ptr3, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr4_type_& in_ptr4, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr5_type_& in_ptr5, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr6_type_& in_ptr6, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr7_type_& in_ptr7, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_27', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 2097152}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'in_ptr3': '*bf16', 'in_ptr4': '*fp32', 'in_ptr5': '*fp32', 'in_ptr6': '*fp32', 'in_ptr7': '*fp32', 'out_ptr1': '*bf16', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]], (12,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_27', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 8, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_27(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, out_ptr1, ks0, ks1, ks2, xnumel, XBLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x2 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // ks0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (x2), xmask, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (x1), xmask, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tl.load(in_ptr2 + (x2), xmask, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.load(in_ptr3 + (x1), xmask, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tl.load(in_ptr4 + (x1 // 16), xmask, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10 = tl.load(in_ptr5 + (x1 // 16), xmask, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp18 = tl.load(in_ptr6 + (x1), xmask, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp20 = tl.load(in_ptr7 + (x1), xmask, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tmp0 + tmp1 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp3 + tmp4 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tmp2 + tmp5 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tmp6.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = tmp7 - tmp8 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp11 = 64*ks1*ks2 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp12 = tmp11.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp13 = (tmp10 / tmp12) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp14 = 1e-06 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp15 = tmp13 + tmp14 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp16 = libdevice.rsqrt(tmp15) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp17 = tmp9 * tmp16 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp19 = tmp17 * tmp18 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp21 = tmp19 + tmp20 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp22 = tl.sigmoid(tmp21) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp23 = tmp21 * tmp22 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp24 = tmp23.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x2), tmp24, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (1024 - 1)) / (1024)); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_27 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_27 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cerpylu2mm64w35oojz7f34d725iijez6ymut46gsocdftasf4sn.cubin", "triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_27", 0, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_267 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_268 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_269 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_270 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_271 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_272 = reinterpret_cast<CUdeviceptr>(in_ptr5.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_273 = reinterpret_cast<CUdeviceptr>(in_ptr6.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_274 = reinterpret_cast<CUdeviceptr>(in_ptr7.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_275 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_276 = ks0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_277 = ks1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_278 = ks2; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_279 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_280 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_267, &var_268, &var_269, &var_270, &var_271, &var_272, &var_273, &var_274, &var_275, &var_276, &var_277, &var_278, &var_279, &global_scratch_280}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_27, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_add_convolution_mul_sigmoid_28( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_out_ptr0_type_& in_out_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr3_type_& in_ptr3, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr4_type_& in_ptr4, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_add_convolution_mul_sigmoid_28', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 2097152}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'in_ptr3': '*bf16', 'in_ptr4': '*bf16', 'ks0': 'i32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_add_convolution_mul_sigmoid_28', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 6, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_add_convolution_mul_sigmoid_28(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, ks0, xnumel, XBLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x2 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // ks0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_out_ptr0 + (x2), xmask, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr0 + (x1), xmask, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tl.load(in_ptr1 + (x2), xmask, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.load(in_ptr2 + (x1), xmask, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tl.load(in_ptr3 + (x2), xmask, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tl.load(in_ptr4 + (x1), xmask, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tmp0 + tmp1 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp3 + tmp4 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tmp2 + tmp5 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = tmp7 + tmp8 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10 = tmp6 + tmp9 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(in_out_ptr0 + (x2), tmp10, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (512 - 1)) / (512)); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_add_convolution_mul_sigmoid_28 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_add_convolution_mul_sigmoid_28 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cna7x4qz26ppazei4h75gxjil3phwhoamgemi6f3q6hyl2pwqoyu.cubin", "triton_poi_fused__to_copy_add_convolution_mul_sigmoid_28", 0, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_281 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_282 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_283 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_284 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_285 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_286 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_287 = ks0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_288 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_289 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_281, &var_282, &var_283, &var_284, &var_285, &var_286, &var_287, &var_288, &global_scratch_289}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_add_convolution_mul_sigmoid_28, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename out_ptr0_type_, typename out_ptr1_type_, typename out_ptr2_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_red_fused_native_group_norm_29( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr2_type_& out_ptr2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t r0_numel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_red_fused_native_group_norm_29', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.reduction( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 256, 'r0_': 8192}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reduction_hint=ReductionHint.INNER, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'out_ptr2': '*fp32', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_native_group_norm_29', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 3, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True} | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_red_fused_native_group_norm_29(in_ptr0, out_ptr0, out_ptr1, out_ptr2, ks0, ks1, ks2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 256 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rnumel = r0_numel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rbase = r0_base | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = (xindex % 8) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // 8 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x3 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_index = r0_offset + r0_base | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_mask = r0_index < r0_numel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] roffset = r0_offset | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rindex = r0_index | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_2 = r0_index | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (4*ks1*ks2*((((r0_2 + 8*ks1*ks2*x0) // ks0) % 16)) + 64*ks1*ks2*x1 + ((((r0_2 % ks0)) % ks0))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tmp0.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tl.broadcast_to(tmp1, [XBLOCK, R0_BLOCK]) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_mean_next, tmp3_m2_next, tmp3_weight_next = triton_helpers.welford_reduce( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2, tmp3_mean, tmp3_m2, tmp3_weight, roffset == 0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_mean = tl.where(r0_mask & xmask, tmp3_mean_next, tmp3_mean) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_m2 = tl.where(r0_mask & xmask, tmp3_m2_next, tmp3_m2) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_weight = tl.where(r0_mask & xmask, tmp3_weight_next, tmp3_weight) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6, tmp7, tmp8 = triton_helpers.welford(tmp3_mean, tmp3_m2, tmp3_weight, 1) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tmp6[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tmp7[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp8[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x3), tmp3, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x3), tmp4, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr2 + (x3), tmp5, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_red_fused_native_group_norm_29 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_red_fused_native_group_norm_29 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cagmz5pdlam2a7g56gbnkt5qx7bz5w6ehdm5536i3o4henvtc4nc.cubin", "triton_red_fused_native_group_norm_29", 192, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_290 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_291 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_292 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_293 = reinterpret_cast<CUdeviceptr>(out_ptr2.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_294 = ks0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_295 = ks1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_296 = ks2; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_297 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_298 = r0_numel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_299 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_290, &var_291, &var_292, &var_293, &var_294, &var_295, &var_296, &var_297, &var_298, &global_scratch_299}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_red_fused_native_group_norm_29, grid_0, grid_1, grid_2, 16, 192, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename out_ptr1_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_30( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr3_type_& in_ptr3, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr4_type_& in_ptr4, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_30', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 2097152}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'out_ptr1': '*bf16', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (9,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_30', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_30(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr1, ks0, ks1, ks2, xnumel, XBLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x2 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // ks0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (x2), xmask, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tl.load(in_ptr1 + (x1 // 16), xmask, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.load(in_ptr2 + (x1 // 16), xmask, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp12 = tl.load(in_ptr3 + (x1), xmask, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp14 = tl.load(in_ptr4 + (x1), xmask, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tmp0.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tmp1 - tmp2 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = 64*ks1*ks2 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tmp5.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = (tmp4 / tmp6) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = 1e-06 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = tmp7 + tmp8 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10 = libdevice.rsqrt(tmp9) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp11 = tmp3 * tmp10 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp13 = tmp11 * tmp12 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp15 = tmp13 + tmp14 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp16 = tl.sigmoid(tmp15) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp17 = tmp15 * tmp16 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp18 = tmp17.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x2), tmp18, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (1024 - 1)) / (1024)); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_30 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_30 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cyr6hjuydoo47hk56ee2zmceyv5eaguqbpn4xnxhkvswp6kxusnv.cubin", "triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_30", 0, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_300 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_301 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_302 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_303 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_304 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_305 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_306 = ks0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_307 = ks1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_308 = ks2; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_309 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_310 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_300, &var_301, &var_302, &var_303, &var_304, &var_305, &var_306, &var_307, &var_308, &var_309, &global_scratch_310}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_30, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename out_ptr0_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_31( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks3, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks4, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks5, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks6, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_31', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 8388608}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'out_ptr0': '*bf16', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'ks3': 'i32', 'ks4': 'i32', 'ks5': 'i32', 'ks6': 'i32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]], (11,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_31', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_31(in_ptr0, in_ptr1, in_ptr2, out_ptr0, ks0, ks1, ks2, ks3, ks4, ks5, ks6, xnumel, XBLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = ((xindex // ks1) % ks2) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = (xindex % ks1) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x2 = xindex // ks4 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x3 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp22 = tl.load(in_ptr2 + (x2), None, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = ((ks0.to(tl.float64)) / (tl.full([], 2.00000000000000, tl.float64)*ks0.to(tl.float64))) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tmp0.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = x1 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tmp2.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tmp3 * tmp1 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp4.to(tl.int64) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = ks0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tmp5 + tmp6 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tmp5 < 0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = tl.where(tmp8, tmp7, tmp5) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10 = ((ks3.to(tl.float64)) / (tl.full([], 2.00000000000000, tl.float64)*ks3.to(tl.float64))) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp11 = tmp10.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp12 = x0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp13 = tmp12.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp14 = tmp13 * tmp11 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp15 = tmp14.to(tl.int64) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp16 = ks3 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp17 = tmp15 + tmp16 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp18 = tmp15 < 0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp19 = tl.where(tmp18, tmp17, tmp15) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp20 = tl.load(in_ptr0 + (tmp19 + 2*ks5*tmp9 + 4*ks5*ks6*x2), None, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp21 = tl.load(in_ptr1 + (tmp19 + 2*ks5*tmp9 + 4*ks5*ks6*x2), None, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp23 = tmp21 + tmp22 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp24 = tmp20 + tmp23 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp25 = tmp24.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp26 = tmp25.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x3), tmp26, None) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (1024 - 1)) / (1024)); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_31 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_31 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/ciatxcv44lyh4todkaep4o6twfjxyjyn7eo5vk3cd64g3zsq7r5g.cubin", "triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_31", 0, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_311 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_312 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_313 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_314 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_315 = ks0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_316 = ks1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_317 = ks2; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_318 = ks3; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_319 = ks4; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_320 = ks5; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_321 = ks6; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_322 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_323 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_311, &var_312, &var_313, &var_314, &var_315, &var_316, &var_317, &var_318, &var_319, &var_320, &var_321, &var_322, &global_scratch_323}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_31, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_32( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_out_ptr0_type_& in_out_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_32', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 8388608}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*bf16', 'ks0': 'i32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_32', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 2, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_32(in_out_ptr0, in_ptr0, ks0, xnumel, XBLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x2 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // ks0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_out_ptr0 + (x2), None, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr0 + (x1), None, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tmp0 + tmp1 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(in_out_ptr0 + (x2), tmp2, None) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (1024 - 1)) / (1024)); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_32 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_32 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/c72tmv5oguk4xfujev6a5fibkw3pttym5ffhzintzl3ribyzm45t.cubin", "triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_32", 0, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_324 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_325 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_326 = ks0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_327 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_328 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_324, &var_325, &var_326, &var_327, &global_scratch_328}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy__unsafe_index_add_convolution_mul_sigmoid_32, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename out_ptr0_type_, typename out_ptr1_type_, typename out_ptr2_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_red_fused_native_group_norm_33( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr2_type_& out_ptr2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t r0_numel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_red_fused_native_group_norm_33', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.reduction( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 1024, 'r0_': 8192}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reduction_hint=ReductionHint.INNER, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'out_ptr2': '*fp32', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_native_group_norm_33', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 3, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True} | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_red_fused_native_group_norm_33(in_ptr0, out_ptr0, out_ptr1, out_ptr2, ks0, ks1, ks2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 1024 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rnumel = r0_numel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rbase = r0_base | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = (xindex % 32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // 32 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x3 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_index = r0_offset + r0_base | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_mask = r0_index < r0_numel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] roffset = r0_offset | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rindex = r0_index | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_2 = r0_index | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (16*ks1*ks2*((((r0_2 + 8*ks1*ks2*x0) // ks0) % 16)) + 256*ks1*ks2*x1 + (((((r0_2 + 8*ks1*ks2*x0) % ks0)) % ks0))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tmp0.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tl.broadcast_to(tmp1, [XBLOCK, R0_BLOCK]) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_mean_next, tmp3_m2_next, tmp3_weight_next = triton_helpers.welford_reduce( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2, tmp3_mean, tmp3_m2, tmp3_weight, roffset == 0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_mean = tl.where(r0_mask & xmask, tmp3_mean_next, tmp3_mean) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_m2 = tl.where(r0_mask & xmask, tmp3_m2_next, tmp3_m2) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_weight = tl.where(r0_mask & xmask, tmp3_weight_next, tmp3_weight) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6, tmp7, tmp8 = triton_helpers.welford(tmp3_mean, tmp3_m2, tmp3_weight, 1) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tmp6[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tmp7[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp8[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x3), tmp3, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x3), tmp4, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr2 + (x3), tmp5, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_red_fused_native_group_norm_33 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_red_fused_native_group_norm_33 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/clzfhzbtnqqa2q6zxs6izbsa4jom7ty3fckozqmq7ut4kk4n2hez.cubin", "triton_red_fused_native_group_norm_33", 192, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_329 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_330 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_331 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_332 = reinterpret_cast<CUdeviceptr>(out_ptr2.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_333 = ks0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_334 = ks1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_335 = ks2; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_336 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_337 = r0_numel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_338 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_329, &var_330, &var_331, &var_332, &var_333, &var_334, &var_335, &var_336, &var_337, &global_scratch_338}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_red_fused_native_group_norm_33, grid_0, grid_1, grid_2, 16, 192, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename out_ptr0_type_, typename out_ptr1_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_per_fused_native_group_norm_34( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t r0_numel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_per_fused_native_group_norm_34', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.persistent_reduction( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 32, 'r0_': 32}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reduction_hint=ReductionHint.INNER, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_per_fused_native_group_norm_34', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 2, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True} | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_per_fused_native_group_norm_34(in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, xnumel, r0_numel, XBLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 32 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_numel = 32 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] R0_BLOCK: tl.constexpr = 32 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rnumel = r0_numel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_index = tl.arange(0, R0_BLOCK)[None, :] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_offset = 0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_mask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] roffset = r0_offset | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rindex = r0_index | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_1 = r0_index | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (r0_1 + 32*x0), xmask, other=0.0) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (r0_1 + 32*x0), xmask, other=0.0) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tl.load(in_ptr2 + (r0_1 + 32*x0), xmask, other=0.0) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tl.broadcast_to(tmp0, [XBLOCK, R0_BLOCK]) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.broadcast_to(tmp1, [XBLOCK, R0_BLOCK]) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tl.broadcast_to(tmp2, [XBLOCK, R0_BLOCK]) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tl.where(xmask, tmp3, 0) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tl.where(xmask, tmp4, 0) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = tl.where(xmask, tmp5, 0) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10, tmp11, tmp12 = triton_helpers.welford(tmp7, tmp8, tmp9, 1) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp13 = tmp10[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp14 = tmp11[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp15 = tmp12[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x0), tmp13, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x0), tmp14, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (8 - 1)) / (8)); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_per_fused_native_group_norm_34 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_per_fused_native_group_norm_34 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cp3mlrwytqztypeekbp62tyad6csd27igkrisuwn7urgixbsnts6.cubin", "triton_per_fused_native_group_norm_34", 32, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_339 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_340 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_341 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_342 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_343 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_344 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_345 = r0_numel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_346 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_339, &var_340, &var_341, &var_342, &var_343, &var_344, &var_345, &global_scratch_346}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_per_fused_native_group_norm_34, grid_0, grid_1, grid_2, 2, 32, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename out_ptr1_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_35( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr3_type_& in_ptr3, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr4_type_& in_ptr4, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_35', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 8388608}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'out_ptr1': '*bf16', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (9,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_35', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_35(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr1, ks0, ks1, ks2, xnumel, XBLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x2 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // ks0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (x2), None, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tl.load(in_ptr1 + (x1 // 16), None, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.load(in_ptr2 + (x1 // 16), None, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp12 = tl.load(in_ptr3 + (x1), None, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp14 = tl.load(in_ptr4 + (x1), None, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tmp0.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tmp1 - tmp2 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = 256*ks1*ks2 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tmp5.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = (tmp4 / tmp6) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = 1e-06 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = tmp7 + tmp8 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10 = libdevice.rsqrt(tmp9) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp11 = tmp3 * tmp10 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp13 = tmp11 * tmp12 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp15 = tmp13 + tmp14 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp16 = tl.sigmoid(tmp15) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp17 = tmp15 * tmp16 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp18 = tmp17.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x2), tmp18, None) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (1024 - 1)) / (1024)); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_35 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_35 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cd6ovmq3a3hzsgvwyxirdqddks7xg2nquokhfglxjvhglsiuzfpk.cubin", "triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_35", 0, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_347 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_348 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_349 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_350 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_351 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_352 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_353 = ks0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_354 = ks1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_355 = ks2; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_356 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_357 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_347, &var_348, &var_349, &var_350, &var_351, &var_352, &var_353, &var_354, &var_355, &var_356, &global_scratch_357}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_35, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename out_ptr0_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_convolution_mul_sigmoid_36( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_convolution_mul_sigmoid_36', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 2097152}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_convolution_mul_sigmoid_36', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_convolution_mul_sigmoid_36(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 1179648 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (x0), None) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tmp0.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x0), tmp1, None) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (1024 - 1)) / (1024)); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_convolution_mul_sigmoid_36 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_convolution_mul_sigmoid_36 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cdunjhkwacuurik4z6ixf3swngb2dtnqggz5b3wdvcqjkl3wbuif.cubin", "triton_poi_fused__to_copy_convolution_mul_sigmoid_36", 0, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_358 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_359 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_360 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_361 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_358, &var_359, &var_360, &global_scratch_361}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_convolution_mul_sigmoid_36, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename out_ptr0_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_convolution_mul_sigmoid_37( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_convolution_mul_sigmoid_37', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 256}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_convolution_mul_sigmoid_37', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_convolution_mul_sigmoid_37(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 256 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (x0), xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tmp0.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x0), tmp1, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (256 - 1)) / (256)); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_convolution_mul_sigmoid_37 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_convolution_mul_sigmoid_37 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/ctgmkxd56s5k6dn3ye32ivg6jp2fdiunnkcrsiri6f2uvy4wmgsz.cubin", "triton_poi_fused__to_copy_convolution_mul_sigmoid_37", 0, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_362 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_363 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_364 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_365 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_362, &var_363, &var_364, &global_scratch_365}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_convolution_mul_sigmoid_37, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename out_ptr0_type_, typename out_ptr1_type_, typename out_ptr2_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_red_fused_native_group_norm_38( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr2_type_& out_ptr2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t r0_numel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_red_fused_native_group_norm_38', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.reduction( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 512, 'r0_': 8192}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reduction_hint=ReductionHint.INNER, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'out_ptr2': '*fp32', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_native_group_norm_38', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 2, 'num_reduction': 3, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True} | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_red_fused_native_group_norm_38(in_ptr0, in_ptr1, out_ptr0, out_ptr1, out_ptr2, ks0, ks1, ks2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 512 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rnumel = r0_numel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rbase = r0_base | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = (xindex % 16) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // 16 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x3 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_index = r0_offset + r0_base | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_mask = r0_index < r0_numel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] roffset = r0_offset | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rindex = r0_index | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_2 = r0_index | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (16*ks1*ks2*((((r0_2 + 8*ks1*ks2*x0) // ks0) % 8)) + 128*ks1*ks2*x1 + (((((r0_2 + 8*ks1*ks2*x0) % ks0)) % ks0))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (8*x1 + ((((r0_2 + 8*ks1*ks2*x0) // ks0) % 8))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tmp0 + tmp1 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tmp2.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.broadcast_to(tmp3, [XBLOCK, R0_BLOCK]) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_mean_next, tmp5_m2_next, tmp5_weight_next = triton_helpers.welford_reduce( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4, tmp5_mean, tmp5_m2, tmp5_weight, roffset == 0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_mean = tl.where(r0_mask & xmask, tmp5_mean_next, tmp5_mean) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_m2 = tl.where(r0_mask & xmask, tmp5_m2_next, tmp5_m2) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5_weight = tl.where(r0_mask & xmask, tmp5_weight_next, tmp5_weight) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8, tmp9, tmp10 = triton_helpers.welford(tmp5_mean, tmp5_m2, tmp5_weight, 1) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp8[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tmp9[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tmp10[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x3), tmp5, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x3), tmp6, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr2 + (x3), tmp7, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_red_fused_native_group_norm_38 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_red_fused_native_group_norm_38 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/csdvwyo6raadjgh5agexkdti5gc7kskvlb2cc5nh3g2foo2s43d3.cubin", "triton_red_fused_native_group_norm_38", 192, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_366 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_367 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_368 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_369 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_370 = reinterpret_cast<CUdeviceptr>(out_ptr2.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_371 = ks0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_372 = ks1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_373 = ks2; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_374 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_375 = r0_numel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_376 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_366, &var_367, &var_368, &var_369, &var_370, &var_371, &var_372, &var_373, &var_374, &var_375, &global_scratch_376}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_red_fused_native_group_norm_38, grid_0, grid_1, grid_2, 16, 192, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename out_ptr0_type_, typename out_ptr1_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_per_fused_native_group_norm_39( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t r0_numel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_per_fused_native_group_norm_39', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.persistent_reduction( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 32, 'r0_': 16}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reduction_hint=ReductionHint.INNER, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_per_fused_native_group_norm_39', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 2, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True} | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_per_fused_native_group_norm_39(in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, xnumel, r0_numel, XBLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 32 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_numel = 16 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] R0_BLOCK: tl.constexpr = 16 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rnumel = r0_numel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_index = tl.arange(0, R0_BLOCK)[None, :] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_offset = 0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_mask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] roffset = r0_offset | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rindex = r0_index | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_1 = r0_index | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (r0_1 + 16*x0), xmask, other=0.0) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (r0_1 + 16*x0), xmask, other=0.0) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tl.load(in_ptr2 + (r0_1 + 16*x0), xmask, other=0.0) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tl.broadcast_to(tmp0, [XBLOCK, R0_BLOCK]) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.broadcast_to(tmp1, [XBLOCK, R0_BLOCK]) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tl.broadcast_to(tmp2, [XBLOCK, R0_BLOCK]) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tl.where(xmask, tmp3, 0) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tl.where(xmask, tmp4, 0) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = tl.where(xmask, tmp5, 0) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10, tmp11, tmp12 = triton_helpers.welford(tmp7, tmp8, tmp9, 1) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp13 = tmp10[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp14 = tmp11[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp15 = tmp12[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x0), tmp13, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x0), tmp14, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (8 - 1)) / (8)); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_per_fused_native_group_norm_39 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_per_fused_native_group_norm_39 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cj4bp4jzrxnhmbr3e2scgoaq5td27c3fdzfqpe4mqhhudol6hpty.cubin", "triton_per_fused_native_group_norm_39", 32, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_377 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_378 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_379 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_380 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_381 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_382 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_383 = r0_numel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_384 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_377, &var_378, &var_379, &var_380, &var_381, &var_382, &var_383, &global_scratch_384}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_per_fused_native_group_norm_39, grid_0, grid_1, grid_2, 2, 32, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename out_ptr0_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_convolution_40( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_convolution_40', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 131072}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_convolution_40', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_convolution_40(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 131072 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (x0), None) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tmp0.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x0), tmp1, None) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (1024 - 1)) / (1024)); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_convolution_40 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_convolution_40 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/conw5fxkzfoxhoyfcwfpxktjs6s35dzi7xuf64335cmzpbynf6qb.cubin", "triton_poi_fused__to_copy_convolution_40", 0, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_385 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_386 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_387 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_388 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_385, &var_386, &var_387, &global_scratch_388}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_convolution_40, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename in_ptr5_type_, typename out_ptr1_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_41( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr3_type_& in_ptr3, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr4_type_& in_ptr4, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr5_type_& in_ptr5, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_41', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 4194304}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'in_ptr5': '*fp32', 'out_ptr1': '*bf16', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (10,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_41', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 6, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_41(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr1, ks0, ks1, ks2, xnumel, XBLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x2 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // ks0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (x2), None, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (x1), None, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.load(in_ptr2 + (x1 // 8), None, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tl.load(in_ptr3 + (x1 // 8), None, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp14 = tl.load(in_ptr4 + (x1), None, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp16 = tl.load(in_ptr5 + (x1), None, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tmp0 + tmp1 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tmp2.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp3 - tmp4 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = 128*ks1*ks2 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tmp7.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = (tmp6 / tmp8) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10 = 1e-06 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp11 = tmp9 + tmp10 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp12 = libdevice.rsqrt(tmp11) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp13 = tmp5 * tmp12 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp15 = tmp13 * tmp14 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp17 = tmp15 + tmp16 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp18 = tl.sigmoid(tmp17) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp19 = tmp17 * tmp18 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp20 = tmp19.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x2), tmp20, None) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (1024 - 1)) / (1024)); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_41 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_41 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cn32ldycezggq5ntm4nizx6k2eqojr6gs75ioxqikw6ydkbcighw.cubin", "triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_41", 0, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_389 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_390 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_391 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_392 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_393 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_394 = reinterpret_cast<CUdeviceptr>(in_ptr5.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_395 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_396 = ks0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_397 = ks1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_398 = ks2; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_399 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_400 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_389, &var_390, &var_391, &var_392, &var_393, &var_394, &var_395, &var_396, &var_397, &var_398, &var_399, &global_scratch_400}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_41, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename out_ptr0_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_convolution_mul_sigmoid_42( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_convolution_mul_sigmoid_42', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 1048576}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_convolution_mul_sigmoid_42', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_convolution_mul_sigmoid_42(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 589824 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (x0), None) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tmp0.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x0), tmp1, None) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (1024 - 1)) / (1024)); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_convolution_mul_sigmoid_42 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_convolution_mul_sigmoid_42 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cfj4tf6wgkmcgrjmlx7vtnbevgypigbvvy2derctmuvlat5k5xq6.cubin", "triton_poi_fused__to_copy_convolution_mul_sigmoid_42", 0, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_401 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_402 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_403 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_404 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_401, &var_402, &var_403, &global_scratch_404}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_convolution_mul_sigmoid_42, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename out_ptr0_type_, typename out_ptr1_type_, typename out_ptr2_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_red_fused_native_group_norm_43( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr3_type_& in_ptr3, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr2_type_& out_ptr2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t r0_numel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_red_fused_native_group_norm_43', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.reduction( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 512, 'r0_': 8192}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reduction_hint=ReductionHint.INNER, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'in_ptr3': '*bf16', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'out_ptr2': '*fp32', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (10,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_native_group_norm_43', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 4, 'num_reduction': 3, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True} | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_red_fused_native_group_norm_43(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, out_ptr1, out_ptr2, ks0, ks1, ks2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 512 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rnumel = r0_numel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rbase = r0_base | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = (xindex % 16) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // 16 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x3 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_index = r0_offset + r0_base | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_mask = r0_index < r0_numel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] roffset = r0_offset | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rindex = r0_index | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_2 = r0_index | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (16*ks1*ks2*((((r0_2 + 8*ks1*ks2*x0) // ks0) % 8)) + 128*ks1*ks2*x1 + (((((r0_2 + 8*ks1*ks2*x0) % ks0)) % ks0))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (8*x1 + ((((r0_2 + 8*ks1*ks2*x0) // ks0) % 8))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tl.load(in_ptr2 + (16*ks1*ks2*((((r0_2 + 8*ks1*ks2*x0) // ks0) % 8)) + 128*ks1*ks2*x1 + (((((r0_2 + 8*ks1*ks2*x0) % ks0)) % ks0))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.load(in_ptr3 + (8*x1 + ((((r0_2 + 8*ks1*ks2*x0) // ks0) % 8))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tmp0 + tmp1 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp3 + tmp4 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tmp2 + tmp5 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tmp6.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tl.broadcast_to(tmp7, [XBLOCK, R0_BLOCK]) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9_mean_next, tmp9_m2_next, tmp9_weight_next = triton_helpers.welford_reduce( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8, tmp9_mean, tmp9_m2, tmp9_weight, roffset == 0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9_mean = tl.where(r0_mask & xmask, tmp9_mean_next, tmp9_mean) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9_m2 = tl.where(r0_mask & xmask, tmp9_m2_next, tmp9_m2) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9_weight = tl.where(r0_mask & xmask, tmp9_weight_next, tmp9_weight) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp12, tmp13, tmp14 = triton_helpers.welford(tmp9_mean, tmp9_m2, tmp9_weight, 1) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = tmp12[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10 = tmp13[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp11 = tmp14[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x3), tmp9, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x3), tmp10, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr2 + (x3), tmp11, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_red_fused_native_group_norm_43 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_red_fused_native_group_norm_43 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/ciooem3z35kwd5bzfyhf5o3vvexbszb67gobf5kuget7qhzns573.cubin", "triton_red_fused_native_group_norm_43", 192, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_405 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_406 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_407 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_408 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_409 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_410 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_411 = reinterpret_cast<CUdeviceptr>(out_ptr2.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_412 = ks0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_413 = ks1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_414 = ks2; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_415 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_416 = r0_numel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_417 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_405, &var_406, &var_407, &var_408, &var_409, &var_410, &var_411, &var_412, &var_413, &var_414, &var_415, &var_416, &global_scratch_417}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_red_fused_native_group_norm_43, grid_0, grid_1, grid_2, 16, 192, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename in_ptr5_type_, typename in_ptr6_type_, typename in_ptr7_type_, typename out_ptr1_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_44( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr3_type_& in_ptr3, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr4_type_& in_ptr4, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr5_type_& in_ptr5, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr6_type_& in_ptr6, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr7_type_& in_ptr7, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_44', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 4194304}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'in_ptr3': '*bf16', 'in_ptr4': '*fp32', 'in_ptr5': '*fp32', 'in_ptr6': '*fp32', 'in_ptr7': '*fp32', 'out_ptr1': '*bf16', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]], (9,): [['tt.divisibility', 16]], (12,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_44', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 8, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_44(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, out_ptr1, ks0, ks1, ks2, xnumel, XBLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x2 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // ks0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (x2), None, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr1 + (x1), None, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tl.load(in_ptr2 + (x2), None, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.load(in_ptr3 + (x1), None, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tl.load(in_ptr4 + (x1 // 8), None, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10 = tl.load(in_ptr5 + (x1 // 8), None, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp18 = tl.load(in_ptr6 + (x1), None, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp20 = tl.load(in_ptr7 + (x1), None, eviction_policy='evict_last') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tmp0 + tmp1 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp3 + tmp4 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tmp2 + tmp5 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tmp6.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = tmp7 - tmp8 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp11 = 128*ks1*ks2 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp12 = tmp11.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp13 = (tmp10 / tmp12) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp14 = 1e-06 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp15 = tmp13 + tmp14 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp16 = libdevice.rsqrt(tmp15) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp17 = tmp9 * tmp16 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp19 = tmp17 * tmp18 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp21 = tmp19 + tmp20 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp22 = tl.sigmoid(tmp21) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp23 = tmp21 * tmp22 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp24 = tmp23.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x2), tmp24, None) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (1024 - 1)) / (1024)); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_44 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_44 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cc4joqjeiqk42d5ccbwlnsic3wwcoyv45f2h7f6rpovczqalzpe2.cubin", "triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_44", 0, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_418 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_419 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_420 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_421 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_422 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_423 = reinterpret_cast<CUdeviceptr>(in_ptr5.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_424 = reinterpret_cast<CUdeviceptr>(in_ptr6.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_425 = reinterpret_cast<CUdeviceptr>(in_ptr7.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_426 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_427 = ks0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_428 = ks1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_429 = ks2; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_430 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_431 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_418, &var_419, &var_420, &var_421, &var_422, &var_423, &var_424, &var_425, &var_426, &var_427, &var_428, &var_429, &var_430, &global_scratch_431}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_44, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_add_convolution_mul_sigmoid_45( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_out_ptr0_type_& in_out_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr3_type_& in_ptr3, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr4_type_& in_ptr4, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_poi_fused__to_copy_add_convolution_mul_sigmoid_45', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.pointwise( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 4194304}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'in_ptr3': '*bf16', 'in_ptr4': '*bf16', 'ks0': 'i32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_add_convolution_mul_sigmoid_45', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 6, 'num_reduction': 0, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] min_elem_per_thread=0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_poi_fused__to_copy_add_convolution_mul_sigmoid_45(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, ks0, xnumel, XBLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x2 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // ks0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_out_ptr0 + (x2), None, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tl.load(in_ptr0 + (x1), None, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tl.load(in_ptr1 + (x2), None, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tl.load(in_ptr2 + (x1), None, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp7 = tl.load(in_ptr3 + (x2), None, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp8 = tl.load(in_ptr4 + (x1), None, eviction_policy='evict_last').to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tmp0 + tmp1 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp3 + tmp4 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6 = tmp2 + tmp5 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp9 = tmp7 + tmp8 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp10 = tmp6 + tmp9 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(in_out_ptr0 + (x2), tmp10, None) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = ((xnumel + (1024 - 1)) / (1024)); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_poi_fused__to_copy_add_convolution_mul_sigmoid_45 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_poi_fused__to_copy_add_convolution_mul_sigmoid_45 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cnxzgbgs6spla2etszz7nxrw2jgnxf7g2ulsrkz3crsqjjjrcefv.cubin", "triton_poi_fused__to_copy_add_convolution_mul_sigmoid_45", 0, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_432 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_433 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_434 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_435 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_436 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_437 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_438 = ks0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_439 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_440 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_432, &var_433, &var_434, &var_435, &var_436, &var_437, &var_438, &var_439, &global_scratch_440}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_poi_fused__to_copy_add_convolution_mul_sigmoid_45, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename out_ptr0_type_, typename out_ptr1_type_, typename out_ptr2_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_red_fused_native_group_norm_46( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr0_type_& out_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr1_type_& out_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const out_ptr2_type_& out_ptr2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t ks2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t xnumel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int64_t r0_numel, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] cudaStream_t stream_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_type_& kernels_, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const std::optional<std::string>& cubin_dir_ = std::nullopt | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ){ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] /* | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] async_compile.triton('triton_red_fused_native_group_norm_46', ''' | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] import triton.language as tl | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_helpers.set_driver_to_gpu() | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton_heuristics.reduction( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] size_hints={'x': 512, 'r0_': 8192}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] reduction_hint=ReductionHint.INNER, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] filename=__file__, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'out_ptr2': '*fp32', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]]}]}, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_native_group_norm_46', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 3, 'backend_hash': '568CF688BAE5AE4E1C1B042E4FB0491164EEFB6E6DE7AC002517B75D90E67E79', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True} | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] @triton.jit | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] def triton_red_fused_native_group_norm_46(in_ptr0, out_ptr0, out_ptr1, out_ptr2, ks0, ks1, ks2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xnumel = 512 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rnumel = r0_numel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] xmask = xindex < xnumel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rbase = r0_base | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x0 = (xindex % 16) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x1 = xindex // 16 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] x3 = xindex | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK): | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_index = r0_offset + r0_base | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_mask = r0_index < r0_numel | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] roffset = r0_offset | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] rindex = r0_index | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] r0_2 = r0_index | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp0 = tl.load(in_ptr0 + (16*ks1*ks2*((((r0_2 + 8*ks1*ks2*x0) // ks0) % 8)) + 128*ks1*ks2*x1 + (((((r0_2 + 8*ks1*ks2*x0) % ks0)) % ks0))), xmask & r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp1 = tmp0.to(tl.float32) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2 = tl.broadcast_to(tmp1, [XBLOCK, R0_BLOCK]) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_mean_next, tmp3_m2_next, tmp3_weight_next = triton_helpers.welford_reduce( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp2, tmp3_mean, tmp3_m2, tmp3_weight, roffset == 0 | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_mean = tl.where(r0_mask & xmask, tmp3_mean_next, tmp3_mean) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_m2 = tl.where(r0_mask & xmask, tmp3_m2_next, tmp3_m2) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3_weight = tl.where(r0_mask & xmask, tmp3_weight_next, tmp3_weight) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp6, tmp7, tmp8 = triton_helpers.welford(tmp3_mean, tmp3_m2, tmp3_weight, 1) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp3 = tmp6[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp4 = tmp7[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tmp5 = tmp8[:, None] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr0 + (x3), tmp3, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr1 + (x3), tmp4, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] tl.store(out_ptr2 + (x3), tmp5, xmask) | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] ''', device_str='cuda') | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] */ | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_0 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_1 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] uint32_t grid_2 = 1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] if (kernels_.triton_red_fused_native_group_norm_46 == nullptr) { | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] kernels_.triton_red_fused_native_group_norm_46 = loadKernel("/tmp/torchinductor_tmanlaibaatar/ciquhvbostvhteb3y5zykyldnwmcte6juppxy5sxm5bazs5f62mt/cjh6z4avfhdhnsmc422ejqmgto2qf746pwta7xtecnwk6vibgis2.cubin", "triton_red_fused_native_group_norm_46", 192, cubin_dir_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_441 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_442 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_443 = reinterpret_cast<CUdeviceptr>(out_ptr1.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr var_444 = reinterpret_cast<CUdeviceptr>(out_ptr2.data_ptr()); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_445 = ks0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_446 = ks1; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_447 = ks2; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int var_448 = xnumel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] int32_t var_449 = r0_numel; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] CUdeviceptr global_scratch_450 = 0; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] void* kernel_args_[] = {&var_441, &var_442, &var_443, &var_444, &var_445, &var_446, &var_447, &var_448, &var_449, &global_scratch_450}; | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] launchKernel(kernels_.triton_red_fused_native_group_norm_46, grid_0, grid_1, grid_2, 16, 192, kernel_args_, stream_); | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] } | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] template <typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename out_ptr1_type_, typename kernels_type_> | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] static inline void call_triton_poi_fused__to_copy_convolution_mul_native_group_norm_sigmoid_47( | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr0_type_& in_ptr0, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr1_type_& in_ptr1, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.py:1340] [__output_code] const in_ptr2_type_& in_ptr2, | |
V0512 13:27:57.361000 3957859 site-packages/torch/_inductor/compile_fx.p |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment