Skip to content

Instantly share code, notes, and snippets.

@chauhang
Created November 11, 2023 16:48
Show Gist options
  • Save chauhang/3b6ddd248477ae6fbed6902163ab9f12 to your computer and use it in GitHub Desktop.
Save chauhang/3b6ddd248477ae6fbed6902163ab9f12 to your computer and use it in GitHub Desktop.
libcuda.so not found error - SDPA Tutorial
The non-compiled module runs in 1190.819 microseconds
---------------------------------------------------------------------------
BackendCompilerFailed Traceback (most recent call last)
<ipython-input-5-b86fe1e1877f> in <cell line: 11>()
9 compiled_model = torch.compile(model)
10 # Let's compile it
---> 11 compiled_model(x)
12 print(
13 f"The compiled module runs in {benchmark_torch_function_in_microseconds(compiled_model, x):.3f} microseconds")
50 frames
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, **kwargs)
1516 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1517 else:
-> 1518 return self._call_impl(*args, **kwargs)
1519
1520 def _call_impl(self, *args, **kwargs):
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1525 or _global_backward_pre_hooks or _global_backward_hooks
1526 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527 return forward_call(*args, **kwargs)
1528
1529 try:
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/eval_frame.py in _fn(*args, **kwargs)
326 dynamic_ctx.__enter__()
327 try:
--> 328 return fn(*args, **kwargs)
329 finally:
330 set_eval_frame(prior)
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, **kwargs)
1516 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1517 else:
-> 1518 return self._call_impl(*args, **kwargs)
1519
1520 def _call_impl(self, *args, **kwargs):
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1525 or _global_backward_pre_hooks or _global_backward_hooks
1526 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527 return forward_call(*args, **kwargs)
1528
1529 try:
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/eval_frame.py in catch_errors(frame, cache_entry, frame_state)
488
489 with compile_lock, _disable_current_modes():
--> 490 return callback(frame, cache_entry, hooks, frame_state)
491
492 catch_errors._torchdynamo_orig_callable = callback # type: ignore[attr-defined]
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/convert_frame.py in _convert_frame(frame, cache_size, hooks, frame_state)
639 counters["frames"]["total"] += 1
640 try:
--> 641 result = inner_convert(frame, cache_size, hooks, frame_state)
642 counters["frames"]["ok"] += 1
643 return result
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/convert_frame.py in _fn(*args, **kwargs)
131 cleanup = setup_compile_debug()
132 try:
--> 133 return fn(*args, **kwargs)
134 finally:
135 cleanup.close()
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/convert_frame.py in _convert_frame_assert(frame, cache_entry, hooks, frame_state)
387 )
388
--> 389 return _compile(
390 frame.f_code,
391 frame.f_globals,
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/convert_frame.py in _compile(code, globals, locals, builtins, compiler_fn, one_graph, export, export_constraints, hooks, cache_size, frame, frame_state, compile_id)
567 with compile_context(CompileContext(compile_id)):
568 try:
--> 569 guarded_code = compile_inner(code, one_graph, hooks, transform)
570 return guarded_code
571 except (
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/utils.py in time_wrapper(*args, **kwargs)
187 with torch.profiler.record_function(f"{key} (dynamo_timed)"):
188 t0 = time.time()
--> 189 r = func(*args, **kwargs)
190 time_spent = time.time() - t0
191 compilation_time_metrics[key].append(time_spent)
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/convert_frame.py in compile_inner(code, one_graph, hooks, transform)
489 for attempt in itertools.count():
490 try:
--> 491 out_code = transform_code_object(code, transform)
492 orig_code_map[out_code] = code
493 break
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/bytecode_transformation.py in transform_code_object(code, transformations, safe)
1026 propagate_line_nums(instructions)
1027
-> 1028 transformations(instructions, code_options)
1029 return clean_and_assemble_instructions(instructions, keys, code_options)[1]
1030
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/convert_frame.py in transform(instructions, code_options)
456 try:
457 with tracing(tracer.output.tracing_context):
--> 458 tracer.run()
459 except (exc.RestartAnalysis, exc.SkipFrame):
460 raise
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/symbolic_convert.py in run(self)
2072
2073 def run(self):
-> 2074 super().run()
2075
2076 def match_nested_cell(self, name, cell):
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/symbolic_convert.py in run(self)
722 self.instruction_pointer is not None
723 and not self.output.should_exit
--> 724 and self.step()
725 ):
726 pass
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/symbolic_convert.py in step(self)
686 self.f_code.co_filename, self.lineno, self.f_code.co_name
687 )
--> 688 getattr(self, inst.opname)(inst)
689
690 return inst.opname != "RETURN_VALUE"
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/symbolic_convert.py in RETURN_VALUE(self, inst)
2160 )
2161 log.debug("RETURN_VALUE triggered compile")
-> 2162 self.output.compile_subgraph(
2163 self,
2164 reason=GraphCompileReason(
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/output_graph.py in compile_subgraph(self, tx, partial_convert, reason)
831 # optimization to generate better code in a common case
832 self.add_output_instructions(
--> 833 self.compile_and_call_fx_graph(tx, list(reversed(stack_values)), root)
834 + [create_instruction("UNPACK_SEQUENCE", arg=len(stack_values))]
835 )
/usr/lib/python3.10/contextlib.py in inner(*args, **kwds)
77 def inner(*args, **kwds):
78 with self._recreate_cm():
---> 79 return func(*args, **kwds)
80 return inner
81
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/output_graph.py in compile_and_call_fx_graph(self, tx, rv, root)
955 )
956
--> 957 compiled_fn = self.call_user_compiler(gm)
958 compiled_fn = disable(compiled_fn)
959
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/utils.py in time_wrapper(*args, **kwargs)
187 with torch.profiler.record_function(f"{key} (dynamo_timed)"):
188 t0 = time.time()
--> 189 r = func(*args, **kwargs)
190 time_spent = time.time() - t0
191 compilation_time_metrics[key].append(time_spent)
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/output_graph.py in call_user_compiler(self, gm)
1022 unimplemented_with_warning(e, self.root_tx.f_code, msg)
1023 except Exception as e:
-> 1024 raise BackendCompilerFailed(self.compiler_fn, e).with_traceback(
1025 e.__traceback__
1026 ) from None
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/output_graph.py in call_user_compiler(self, gm)
1007 if config.verify_correctness:
1008 compiler_fn = WrapperBackend(compiler_fn)
-> 1009 compiled_fn = compiler_fn(gm, self.example_inputs())
1010 _step_logger()(logging.INFO, f"done compiler function {name}")
1011 assert callable(compiled_fn), "compiler_fn did not return callable"
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/repro/after_dynamo.py in debug_wrapper(gm, example_inputs, **kwargs)
115 raise
116 else:
--> 117 compiled_gm = compiler_fn(gm, example_inputs)
118
119 return compiled_gm
/usr/local/lib/python3.10/dist-packages/torch/__init__.py in __call__(self, model_, inputs_)
1566 from torch._inductor.compile_fx import compile_fx
1567
-> 1568 return compile_fx(model_, inputs_, config_patches=self.config)
1569
1570 def get_compiler_config(self):
/usr/local/lib/python3.10/dist-packages/torch/_inductor/compile_fx.py in compile_fx(model_, example_inputs_, inner_compile, config_patches, decompositions)
1148 tracing_context
1149 ), compiled_autograd.disable():
-> 1150 return aot_autograd(
1151 fw_compiler=fw_compiler,
1152 bw_compiler=bw_compiler,
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/backends/common.py in compiler_fn(gm, example_inputs)
53 # NB: NOT cloned!
54 with enable_aot_logging(), patch_config:
---> 55 cg = aot_module_simplified(gm, example_inputs, **kwargs)
56 counters["aot_autograd"]["ok"] += 1
57 return disable(cg)
/usr/local/lib/python3.10/dist-packages/torch/_functorch/aot_autograd.py in aot_module_simplified(mod, args, fw_compiler, bw_compiler, partition_fn, decompositions, keep_inference_input_mutations, inference_compiler)
3889
3890 with compiled_autograd.disable():
-> 3891 compiled_fn = create_aot_dispatcher_function(
3892 functional_call,
3893 full_args,
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/utils.py in time_wrapper(*args, **kwargs)
187 with torch.profiler.record_function(f"{key} (dynamo_timed)"):
188 t0 = time.time()
--> 189 r = func(*args, **kwargs)
190 time_spent = time.time() - t0
191 compilation_time_metrics[key].append(time_spent)
/usr/local/lib/python3.10/dist-packages/torch/_functorch/aot_autograd.py in create_aot_dispatcher_function(flat_fn, flat_args, aot_config)
3427 # You can put more passes here
3428
-> 3429 compiled_fn = compiler_fn(flat_fn, fake_flat_args, aot_config, fw_metadata=fw_metadata)
3430 if aot_config.is_export:
3431
/usr/local/lib/python3.10/dist-packages/torch/_functorch/aot_autograd.py in aot_wrapper_dedupe(flat_fn, flat_args, aot_config, compiler_fn, fw_metadata)
2210
2211 if ok:
-> 2212 return compiler_fn(flat_fn, leaf_flat_args, aot_config, fw_metadata=fw_metadata)
2213
2214 # export path: ban duplicate inputs for now, add later if requested.
/usr/local/lib/python3.10/dist-packages/torch/_functorch/aot_autograd.py in aot_wrapper_synthetic_base(flat_fn, flat_args, aot_config, fw_metadata, needs_autograd, compiler_fn)
2390 # Happy path: we don't need synthetic bases
2391 if synthetic_base_info is None:
-> 2392 return compiler_fn(flat_fn, flat_args, aot_config, fw_metadata=fw_metadata)
2393
2394 # export path: ban synthetic bases for now, add later if requested.
/usr/local/lib/python3.10/dist-packages/torch/_functorch/aot_autograd.py in aot_dispatch_autograd(flat_fn, flat_args, aot_config, fw_metadata)
2915
2916 with TracingContext.report_output_strides() as fwd_output_strides:
-> 2917 compiled_fw_func = aot_config.fw_compiler(
2918 fw_module, adjusted_flat_args
2919 )
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/utils.py in time_wrapper(*args, **kwargs)
187 with torch.profiler.record_function(f"{key} (dynamo_timed)"):
188 t0 = time.time()
--> 189 r = func(*args, **kwargs)
190 time_spent = time.time() - t0
191 compilation_time_metrics[key].append(time_spent)
/usr/local/lib/python3.10/dist-packages/torch/_inductor/compile_fx.py in fw_compiler_base(model, example_inputs, is_inference)
1090 }
1091
-> 1092 return inner_compile(
1093 model,
1094 example_inputs,
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/repro/after_aot.py in debug_wrapper(gm, example_inputs, **kwargs)
78 # Call the compiler_fn - which is either aot_autograd or inductor
79 # with fake inputs
---> 80 inner_compiled_fn = compiler_fn(gm, example_inputs)
81 except Exception as e:
82 # TODO: Failures here are troublesome because no real inputs,
/usr/local/lib/python3.10/dist-packages/torch/_inductor/debug.py in inner(*args, **kwargs)
226 def inner(*args, **kwargs):
227 with DebugContext():
--> 228 return fn(*args, **kwargs)
229
230 return wrap_compiler_debug(inner, compiler_name="inductor")
/usr/lib/python3.10/contextlib.py in inner(*args, **kwds)
77 def inner(*args, **kwds):
78 with self._recreate_cm():
---> 79 return func(*args, **kwds)
80 return inner
81
/usr/local/lib/python3.10/dist-packages/torch/_inductor/compile_fx.py in newFunction(*args, **kwargs)
52 @wraps(old_func)
53 def newFunction(*args, **kwargs):
---> 54 return old_func(*args, **kwargs)
55
56 return newFunction
/usr/local/lib/python3.10/dist-packages/torch/_inductor/compile_fx.py in compile_fx_inner(gm, example_inputs, cudagraphs, num_fixed, is_backward, graph_id, cpp_wrapper, aot_mode, is_inference, boxed_forward_device_index, user_visible_outputs, layout_opt)
339 }
340
--> 341 compiled_graph: CompiledFxGraph = fx_codegen_and_compile(
342 *graph_args, **graph_kwargs # type: ignore[arg-type]
343 )
/usr/local/lib/python3.10/dist-packages/torch/_inductor/compile_fx.py in fx_codegen_and_compile(gm, example_inputs, cudagraphs, num_fixed, is_backward, graph_id, cpp_wrapper, aot_mode, is_inference, user_visible_outputs, layout_opt)
563 else:
564 context.output_strides.append(None)
--> 565 compiled_fn = graph.compile_to_fn()
566
567 if graph.disable_cudagraphs:
/usr/local/lib/python3.10/dist-packages/torch/_inductor/graph.py in compile_to_fn(self)
968 return AotCodeCache.compile(self, code, cuda=self.cuda)
969 else:
--> 970 return self.compile_to_module().call
971
972 def get_output_names(self):
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/utils.py in time_wrapper(*args, **kwargs)
187 with torch.profiler.record_function(f"{key} (dynamo_timed)"):
188 t0 = time.time()
--> 189 r = func(*args, **kwargs)
190 time_spent = time.time() - t0
191 compilation_time_metrics[key].append(time_spent)
/usr/local/lib/python3.10/dist-packages/torch/_inductor/graph.py in compile_to_module(self)
939 linemap = [(line_no, node.stack_trace) for line_no, node in linemap]
940 key, path = PyCodeCache.write(code)
--> 941 mod = PyCodeCache.load_by_key_path(key, path, linemap=linemap)
942 self.cache_key = key
943 self.cache_path = path
/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py in load_by_key_path(cls, key, path, linemap)
1137 mod.__file__ = path
1138 mod.key = key
-> 1139 exec(code, mod.__dict__, mod.__dict__)
1140 sys.modules[mod.__name__] = mod
1141 # another thread might set this first
/tmp/torchinductor_root/36/c36twgeyj3id4letfelt2vbacz7abhe244te45y6bykwh6ycmxke.py in <module>
50
51
---> 52 async_compile.wait(globals())
53 del async_compile
54
/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py in wait(self, scope)
1416 pbar.set_postfix_str(key)
1417 if isinstance(result, (Future, TritonFuture)):
-> 1418 scope[key] = result.result()
1419 pbar.update(1)
1420
/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py in result(self)
1275 return self.kernel
1276 # If the worker failed this will throw an exception.
-> 1277 self.future.result()
1278 kernel = self.kernel = _load_kernel(self.kernel_name, self.source_code)
1279 latency = time() - t0
/usr/lib/python3.10/concurrent/futures/_base.py in result(self, timeout)
456 raise CancelledError()
457 elif self._state == FINISHED:
--> 458 return self.__get_result()
459 else:
460 raise TimeoutError()
/usr/lib/python3.10/concurrent/futures/_base.py in __get_result(self)
401 if self._exception:
402 try:
--> 403 raise self._exception
404 finally:
405 # Break a reference cycle with the exception in self._exception
BackendCompilerFailed: backend='inductor' raised:
AssertionError: libcuda.so cannot found!
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment