Skip to content

Instantly share code, notes, and snippets.

@chauhang
Created November 11, 2023 16:48
Show Gist options
  • Save chauhang/3b6ddd248477ae6fbed6902163ab9f12 to your computer and use it in GitHub Desktop.
Save chauhang/3b6ddd248477ae6fbed6902163ab9f12 to your computer and use it in GitHub Desktop.
libcuda.so not found error - SDPA Tutorial
The non-compiled module runs in 1190.819 microseconds
---------------------------------------------------------------------------
BackendCompilerFailed Traceback (most recent call last)
<ipython-input-5-b86fe1e1877f> in <cell line: 11>()
9 compiled_model = torch.compile(model)
10 # Let's compile it
---> 11 compiled_model(x)
12 print(
13 f"The compiled module runs in {benchmark_torch_function_in_microseconds(compiled_model, x):.3f} microseconds")
50 frames
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, **kwargs)
1516 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1517 else:
-> 1518 return self._call_impl(*args, **kwargs)
1519
1520 def _call_impl(self, *args, **kwargs):
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1525 or _global_backward_pre_hooks or _global_backward_hooks
1526 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527 return forward_call(*args, **kwargs)
1528
1529 try:
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/eval_frame.py in _fn(*args, **kwargs)
326 dynamic_ctx.__enter__()
327 try:
--> 328 return fn(*args, **kwargs)
329 finally:
330 set_eval_frame(prior)
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, **kwargs)
1516 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1517 else:
-> 1518 return self._call_impl(*args, **kwargs)
1519
1520 def _call_impl(self, *args, **kwargs):
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1525 or _global_backward_pre_hooks or _global_backward_hooks
1526 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527 return forward_call(*args, **kwargs)
1528
1529 try:
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/eval_frame.py in catch_errors(frame, cache_entry, frame_state)
488
489 with compile_lock, _disable_current_modes():
--> 490 return callback(frame, cache_entry, hooks, frame_state)
491
492 catch_errors._torchdynamo_orig_callable = callback # type: ignore[attr-defined]
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/convert_frame.py in _convert_frame(frame, cache_size, hooks, frame_state)
639 counters["frames"]["total"] += 1
640 try:
--> 641 result = inner_convert(frame, cache_size, hooks, frame_state)
642 counters["frames"]["ok"] += 1
643 return result
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/convert_frame.py in _fn(*args, **kwargs)
131 cleanup = setup_compile_debug()
132 try:
--> 133 return fn(*args, **kwargs)
134 finally:
135 cleanup.close()
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/convert_frame.py in _convert_frame_assert(frame, cache_entry, hooks, frame_state)
387 )
388
--> 389 return _compile(
390 frame.f_code,
391 frame.f_globals,
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/convert_frame.py in _compile(code, globals, locals, builtins, compiler_fn, one_graph, export, export_constraints, hooks, cache_size, frame, frame_state, compile_id)
567 with compile_context(CompileContext(compile_id)):
568 try:
--> 569 guarded_code = compile_inner(code, one_graph, hooks, transform)
570 return guarded_code
571 except (
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/utils.py in time_wrapper(*args, **kwargs)
187 with torch.profiler.record_function(f"{key} (dynamo_timed)"):
188 t0 = time.time()
--> 189 r = func(*args, **kwargs)
190 time_spent = time.time() - t0
191 compilation_time_metrics[key].append(time_spent)
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/convert_frame.py in compile_inner(code, one_graph, hooks, transform)
489 for attempt in itertools.count():
490 try:
--> 491 out_code = transform_code_object(code, transform)
492 orig_code_map[out_code] = code
493 break
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/bytecode_transformation.py in transform_code_object(code, transformations, safe)
1026 propagate_line_nums(instructions)
1027
-> 1028 transformations(instructions, code_options)
1029 return clean_and_assemble_instructions(instructions, keys, code_options)[1]
1030
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/convert_frame.py in transform(instructions, code_options)
456 try:
457 with tracing(tracer.output.tracing_context):
--> 458 tracer.run()
459 except (exc.RestartAnalysis, exc.SkipFrame):
460 raise
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/symbolic_convert.py in run(self)
2072
2073 def run(self):
-> 2074 super().run()
2075
2076 def match_nested_cell(self, name, cell):
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/symbolic_convert.py in run(self)
722 self.instruction_pointer is not None
723 and not self.output.should_exit
--> 724 and self.step()
725 ):
726 pass
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/symbolic_convert.py in step(self)
686 self.f_code.co_filename, self.lineno, self.f_code.co_name
687 )
--> 688 getattr(self, inst.opname)(inst)
689
690 return inst.opname != "RETURN_VALUE"
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/symbolic_convert.py in RETURN_VALUE(self, inst)
2160 )
2161 log.debug("RETURN_VALUE triggered compile")
-> 2162 self.output.compile_subgraph(
2163 self,
2164 reason=GraphCompileReason(
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/output_graph.py in compile_subgraph(self, tx, partial_convert, reason)
831 # optimization to generate better code in a common case
832 self.add_output_instructions(
--> 833 self.compile_and_call_fx_graph(tx, list(reversed(stack_values)), root)
834 + [create_instruction("UNPACK_SEQUENCE", arg=len(stack_values))]
835 )
/usr/lib/python3.10/contextlib.py in inner(*args, **kwds)
77 def inner(*args, **kwds):
78 with self._recreate_cm():
---> 79 return func(*args, **kwds)
80 return inner
81
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/output_graph.py in compile_and_call_fx_graph(self, tx, rv, root)
955 )
956
--> 957 compiled_fn = self.call_user_compiler(gm)
958 compiled_fn = disable(compiled_fn)
959
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/utils.py in time_wrapper(*args, **kwargs)
187 with torch.profiler.record_function(f"{key} (dynamo_timed)"):
188 t0 = time.time()
--> 189 r = func(*args, **kwargs)
190 time_spent = time.time() - t0
191 compilation_time_metrics[key].append(time_spent)
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/output_graph.py in call_user_compiler(self, gm)
1022 unimplemented_with_warning(e, self.root_tx.f_code, msg)
1023 except Exception as e:
-> 1024 raise BackendCompilerFailed(self.compiler_fn, e).with_traceback(
1025 e.__traceback__
1026 ) from None
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/output_graph.py in call_user_compiler(self, gm)
1007 if config.verify_correctness:
1008 compiler_fn = WrapperBackend(compiler_fn)
-> 1009 compiled_fn = compiler_fn(gm, self.example_inputs())
1010 _step_logger()(logging.INFO, f"done compiler function {name}")
1011 assert callable(compiled_fn), "compiler_fn did not return callable"
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/repro/after_dynamo.py in debug_wrapper(gm, example_inputs, **kwargs)
115 raise
116 else:
--> 117 compiled_gm = compiler_fn(gm, example_inputs)
118
119 return compiled_gm
/usr/local/lib/python3.10/dist-packages/torch/__init__.py in __call__(self, model_, inputs_)
1566 from torch._inductor.compile_fx import compile_fx
1567
-> 1568 return compile_fx(model_, inputs_, config_patches=self.config)
1569
1570 def get_compiler_config(self):
/usr/local/lib/python3.10/dist-packages/torch/_inductor/compile_fx.py in compile_fx(model_, example_inputs_, inner_compile, config_patches, decompositions)
1148 tracing_context
1149 ), compiled_autograd.disable():
-> 1150 return aot_autograd(
1151 fw_compiler=fw_compiler,
1152 bw_compiler=bw_compiler,
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/backends/common.py in compiler_fn(gm, example_inputs)
53 # NB: NOT cloned!
54 with enable_aot_logging(), patch_config:
---> 55 cg = aot_module_simplified(gm, example_inputs, **kwargs)
56 counters["aot_autograd"]["ok"] += 1
57 return disable(cg)
/usr/local/lib/python3.10/dist-packages/torch/_functorch/aot_autograd.py in aot_module_simplified(mod, args, fw_compiler, bw_compiler, partition_fn, decompositions, keep_inference_input_mutations, inference_compiler)
3889
3890 with compiled_autograd.disable():
-> 3891 compiled_fn = create_aot_dispatcher_function(
3892 functional_call,
3893 full_args,
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/utils.py in time_wrapper(*args, **kwargs)
187 with torch.profiler.record_function(f"{key} (dynamo_timed)"):
188 t0 = time.time()
--> 189 r = func(*args, **kwargs)
190 time_spent = time.time() - t0
191 compilation_time_metrics[key].append(time_spent)
/usr/local/lib/python3.10/dist-packages/torch/_functorch/aot_autograd.py in create_aot_dispatcher_function(flat_fn, flat_args, aot_config)
3427 # You can put more passes here
3428
-> 3429 compiled_fn = compiler_fn(flat_fn, fake_flat_args, aot_config, fw_metadata=fw_metadata)
3430 if aot_config.is_export:
3431
/usr/local/lib/python3.10/dist-packages/torch/_functorch/aot_autograd.py in aot_wrapper_dedupe(flat_fn, flat_args, aot_config, compiler_fn, fw_metadata)
2210
2211 if ok:
-> 2212 return compiler_fn(flat_fn, leaf_flat_args, aot_config, fw_metadata=fw_metadata)
2213
2214 # export path: ban duplicate inputs for now, add later if requested.
/usr/local/lib/python3.10/dist-packages/torch/_functorch/aot_autograd.py in aot_wrapper_synthetic_base(flat_fn, flat_args, aot_config, fw_metadata, needs_autograd, compiler_fn)
2390 # Happy path: we don't need synthetic bases
2391 if synthetic_base_info is None:
-> 2392 return compiler_fn(flat_fn, flat_args, aot_config, fw_metadata=fw_metadata)
2393
2394 # export path: ban synthetic bases for now, add later if requested.
/usr/local/lib/python3.10/dist-packages/torch/_functorch/aot_autograd.py in aot_dispatch_autograd(flat_fn, flat_args, aot_config, fw_metadata)
2915
2916 with TracingContext.report_output_strides() as fwd_output_strides:
-> 2917 compiled_fw_func = aot_config.fw_compiler(
2918 fw_module, adjusted_flat_args
2919 )
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/utils.py in time_wrapper(*args, **kwargs)
187 with torch.profiler.record_function(f"{key} (dynamo_timed)"):
188 t0 = time.time()
--> 189 r = func(*args, **kwargs)
190 time_spent = time.time() - t0
191 compilation_time_metrics[key].append(time_spent)
/usr/local/lib/python3.10/dist-packages/torch/_inductor/compile_fx.py in fw_compiler_base(model, example_inputs, is_inference)
1090 }
1091
-> 1092 return inner_compile(
1093 model,
1094 example_inputs,
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/repro/after_aot.py in debug_wrapper(gm, example_inputs, **kwargs)
78 # Call the compiler_fn - which is either aot_autograd or inductor
79 # with fake inputs
---> 80 inner_compiled_fn = compiler_fn(gm, example_inputs)
81 except Exception as e:
82 # TODO: Failures here are troublesome because no real inputs,
/usr/local/lib/python3.10/dist-packages/torch/_inductor/debug.py in inner(*args, **kwargs)
226 def inner(*args, **kwargs):
227 with DebugContext():
--> 228 return fn(*args, **kwargs)
229
230 return wrap_compiler_debug(inner, compiler_name="inductor")
/usr/lib/python3.10/contextlib.py in inner(*args, **kwds)
77 def inner(*args, **kwds):
78 with self._recreate_cm():
---> 79 return func(*args, **kwds)
80 return inner
81
/usr/local/lib/python3.10/dist-packages/torch/_inductor/compile_fx.py in newFunction(*args, **kwargs)
52 @wraps(old_func)
53 def newFunction(*args, **kwargs):
---> 54 return old_func(*args, **kwargs)
55
56 return newFunction
/usr/local/lib/python3.10/dist-packages/torch/_inductor/compile_fx.py in compile_fx_inner(gm, example_inputs, cudagraphs, num_fixed, is_backward, graph_id, cpp_wrapper, aot_mode, is_inference, boxed_forward_device_index, user_visible_outputs, layout_opt)
339 }
340
--> 341 compiled_graph: CompiledFxGraph = fx_codegen_and_compile(
342 *graph_args, **graph_kwargs # type: ignore[arg-type]
343 )
/usr/local/lib/python3.10/dist-packages/torch/_inductor/compile_fx.py in fx_codegen_and_compile(gm, example_inputs, cudagraphs, num_fixed, is_backward, graph_id, cpp_wrapper, aot_mode, is_inference, user_visible_outputs, layout_opt)
563 else:
564 context.output_strides.append(None)
--> 565 compiled_fn = graph.compile_to_fn()
566
567 if graph.disable_cudagraphs:
/usr/local/lib/python3.10/dist-packages/torch/_inductor/graph.py in compile_to_fn(self)
968 return AotCodeCache.compile(self, code, cuda=self.cuda)
969 else:
--> 970 return self.compile_to_module().call
971
972 def get_output_names(self):
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/utils.py in time_wrapper(*args, **kwargs)
187 with torch.profiler.record_function(f"{key} (dynamo_timed)"):
188 t0 = time.time()
--> 189 r = func(*args, **kwargs)
190 time_spent = time.time() - t0
191 compilation_time_metrics[key].append(time_spent)
/usr/local/lib/python3.10/dist-packages/torch/_inductor/graph.py in compile_to_module(self)
939 linemap = [(line_no, node.stack_trace) for line_no, node in linemap]
940 key, path = PyCodeCache.write(code)
--> 941 mod = PyCodeCache.load_by_key_path(key, path, linemap=linemap)
942 self.cache_key = key
943 self.cache_path = path
/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py in load_by_key_path(cls, key, path, linemap)
1137 mod.__file__ = path
1138 mod.key = key
-> 1139 exec(code, mod.__dict__, mod.__dict__)
1140 sys.modules[mod.__name__] = mod
1141 # another thread might set this first
/tmp/torchinductor_root/36/c36twgeyj3id4letfelt2vbacz7abhe244te45y6bykwh6ycmxke.py in <module>
50
51
---> 52 async_compile.wait(globals())
53 del async_compile
54
/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py in wait(self, scope)
1416 pbar.set_postfix_str(key)
1417 if isinstance(result, (Future, TritonFuture)):
-> 1418 scope[key] = result.result()
1419 pbar.update(1)
1420
/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py in result(self)
1275 return self.kernel
1276 # If the worker failed this will throw an exception.
-> 1277 self.future.result()
1278 kernel = self.kernel = _load_kernel(self.kernel_name, self.source_code)
1279 latency = time() - t0
/usr/lib/python3.10/concurrent/futures/_base.py in result(self, timeout)
456 raise CancelledError()
457 elif self._state == FINISHED:
--> 458 return self.__get_result()
459 else:
460 raise TimeoutError()
/usr/lib/python3.10/concurrent/futures/_base.py in __get_result(self)
401 if self._exception:
402 try:
--> 403 raise self._exception
404 finally:
405 # Break a reference cycle with the exception in self._exception
BackendCompilerFailed: backend='inductor' raised:
AssertionError: libcuda.so cannot found!
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment