Created: November 11, 2023 16:48
libcuda.so not found error - SDPA Tutorial
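For context, this failure comes from the torch.compile benchmark cell of the PyTorch SDPA tutorial, run in a Colab notebook (note the <ipython-input-...> frame and the /tmp/torchinductor_root path in the traceback below). A minimal sketch of the failing pattern follows; the model is a generic stand-in, not the tutorial's exact attention module, and the timing helper follows the tutorial's torch.utils.benchmark pattern:

import torch
import torch.nn as nn
import torch.utils.benchmark as benchmark

def benchmark_torch_function_in_microseconds(f, *args, **kwargs):
    # Time f with torch.utils.benchmark; .mean is in seconds, so scale to microseconds.
    t0 = benchmark.Timer(
        stmt="f(*args, **kwargs)",
        globals={"args": args, "kwargs": kwargs, "f": f},
    )
    return t0.blocked_autorange().mean * 1e6

device = "cuda" if torch.cuda.is_available() else "cpu"
# Hypothetical stand-in model; the tutorial builds its own SDPA-based attention block.
model = nn.TransformerEncoderLayer(d_model=512, nhead=8, batch_first=True).to(device)
x = torch.rand(32, 128, 512, device=device)

print(f"The non compiled module runs in {benchmark_torch_function_in_microseconds(model, x):.3f} microseconds")

# Let's compile it
compiled_model = torch.compile(model)
compiled_model(x)  # first call triggers Inductor codegen; this is where the traceback starts
print(f"The compiled module runs in {benchmark_torch_function_in_microseconds(compiled_model, x):.3f} microseconds")

The eager run succeeds (first line of the output below); the failure occurs only when Inductor tries to build its generated Triton kernels.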
The non compiled module runs in 1190.819 microseconds
---------------------------------------------------------------------------
BackendCompilerFailed Traceback (most recent call last)
<ipython-input-5-b86fe1e1877f> in <cell line: 11>()
9 compiled_model = torch.compile(model)
10 # Let's compile it
---> 11 compiled_model(x)
12 print(
13 f"The compiled module runs in {benchmark_torch_function_in_microseconds(compiled_model, x):.3f} microseconds")
50 frames
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, **kwargs)
1516 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1517 else:
-> 1518 return self._call_impl(*args, **kwargs)
1519
1520 def _call_impl(self, *args, **kwargs):
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1525 or _global_backward_pre_hooks or _global_backward_hooks
1526 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527 return forward_call(*args, **kwargs)
1528
1529 try:
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/eval_frame.py in _fn(*args, **kwargs)
326 dynamic_ctx.__enter__()
327 try:
--> 328 return fn(*args, **kwargs)
329 finally:
330 set_eval_frame(prior)
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, **kwargs)
1516 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1517 else:
-> 1518 return self._call_impl(*args, **kwargs)
1519
1520 def _call_impl(self, *args, **kwargs):
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1525 or _global_backward_pre_hooks or _global_backward_hooks
1526 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527 return forward_call(*args, **kwargs)
1528
1529 try:
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/eval_frame.py in catch_errors(frame, cache_entry, frame_state)
488
489 with compile_lock, _disable_current_modes():
--> 490 return callback(frame, cache_entry, hooks, frame_state)
491
492 catch_errors._torchdynamo_orig_callable = callback # type: ignore[attr-defined]
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/convert_frame.py in _convert_frame(frame, cache_size, hooks, frame_state)
639 counters["frames"]["total"] += 1
640 try:
--> 641 result = inner_convert(frame, cache_size, hooks, frame_state)
642 counters["frames"]["ok"] += 1
643 return result
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/convert_frame.py in _fn(*args, **kwargs)
131 cleanup = setup_compile_debug()
132 try:
--> 133 return fn(*args, **kwargs)
134 finally:
135 cleanup.close()
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/convert_frame.py in _convert_frame_assert(frame, cache_entry, hooks, frame_state)
387 )
388
--> 389 return _compile(
390 frame.f_code,
391 frame.f_globals,
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/convert_frame.py in _compile(code, globals, locals, builtins, compiler_fn, one_graph, export, export_constraints, hooks, cache_size, frame, frame_state, compile_id)
567 with compile_context(CompileContext(compile_id)):
568 try:
--> 569 guarded_code = compile_inner(code, one_graph, hooks, transform)
570 return guarded_code
571 except (
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/utils.py in time_wrapper(*args, **kwargs)
187 with torch.profiler.record_function(f"{key} (dynamo_timed)"):
188 t0 = time.time()
--> 189 r = func(*args, **kwargs)
190 time_spent = time.time() - t0
191 compilation_time_metrics[key].append(time_spent)
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/convert_frame.py in compile_inner(code, one_graph, hooks, transform)
489 for attempt in itertools.count():
490 try:
--> 491 out_code = transform_code_object(code, transform)
492 orig_code_map[out_code] = code
493 break
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/bytecode_transformation.py in transform_code_object(code, transformations, safe)
1026 propagate_line_nums(instructions)
1027
-> 1028 transformations(instructions, code_options)
1029 return clean_and_assemble_instructions(instructions, keys, code_options)[1]
1030
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/convert_frame.py in transform(instructions, code_options)
456 try:
457 with tracing(tracer.output.tracing_context):
--> 458 tracer.run()
459 except (exc.RestartAnalysis, exc.SkipFrame):
460 raise
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/symbolic_convert.py in run(self)
2072
2073 def run(self):
-> 2074 super().run()
2075
2076 def match_nested_cell(self, name, cell):
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/symbolic_convert.py in run(self)
722 self.instruction_pointer is not None
723 and not self.output.should_exit
--> 724 and self.step()
725 ):
726 pass
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/symbolic_convert.py in step(self)
686 self.f_code.co_filename, self.lineno, self.f_code.co_name
687 )
--> 688 getattr(self, inst.opname)(inst)
689
690 return inst.opname != "RETURN_VALUE"
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/symbolic_convert.py in RETURN_VALUE(self, inst)
2160 )
2161 log.debug("RETURN_VALUE triggered compile")
-> 2162 self.output.compile_subgraph(
2163 self,
2164 reason=GraphCompileReason(
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/output_graph.py in compile_subgraph(self, tx, partial_convert, reason)
831 # optimization to generate better code in a common case
832 self.add_output_instructions(
--> 833 self.compile_and_call_fx_graph(tx, list(reversed(stack_values)), root)
834 + [create_instruction("UNPACK_SEQUENCE", arg=len(stack_values))]
835 )
/usr/lib/python3.10/contextlib.py in inner(*args, **kwds)
77 def inner(*args, **kwds):
78 with self._recreate_cm():
---> 79 return func(*args, **kwds)
80 return inner
81
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/output_graph.py in compile_and_call_fx_graph(self, tx, rv, root)
955 )
956
--> 957 compiled_fn = self.call_user_compiler(gm)
958 compiled_fn = disable(compiled_fn)
959
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/utils.py in time_wrapper(*args, **kwargs)
187 with torch.profiler.record_function(f"{key} (dynamo_timed)"):
188 t0 = time.time()
--> 189 r = func(*args, **kwargs)
190 time_spent = time.time() - t0
191 compilation_time_metrics[key].append(time_spent)
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/output_graph.py in call_user_compiler(self, gm)
1022 unimplemented_with_warning(e, self.root_tx.f_code, msg)
1023 except Exception as e:
-> 1024 raise BackendCompilerFailed(self.compiler_fn, e).with_traceback(
1025 e.__traceback__
1026 ) from None
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/output_graph.py in call_user_compiler(self, gm)
1007 if config.verify_correctness:
1008 compiler_fn = WrapperBackend(compiler_fn)
-> 1009 compiled_fn = compiler_fn(gm, self.example_inputs())
1010 _step_logger()(logging.INFO, f"done compiler function {name}")
1011 assert callable(compiled_fn), "compiler_fn did not return callable"
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/repro/after_dynamo.py in debug_wrapper(gm, example_inputs, **kwargs)
115 raise
116 else:
--> 117 compiled_gm = compiler_fn(gm, example_inputs)
118
119 return compiled_gm
/usr/local/lib/python3.10/dist-packages/torch/__init__.py in __call__(self, model_, inputs_)
1566 from torch._inductor.compile_fx import compile_fx
1567
-> 1568 return compile_fx(model_, inputs_, config_patches=self.config)
1569
1570 def get_compiler_config(self):
/usr/local/lib/python3.10/dist-packages/torch/_inductor/compile_fx.py in compile_fx(model_, example_inputs_, inner_compile, config_patches, decompositions)
1148 tracing_context
1149 ), compiled_autograd.disable():
-> 1150 return aot_autograd(
1151 fw_compiler=fw_compiler,
1152 bw_compiler=bw_compiler,
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/backends/common.py in compiler_fn(gm, example_inputs)
53 # NB: NOT cloned!
54 with enable_aot_logging(), patch_config:
---> 55 cg = aot_module_simplified(gm, example_inputs, **kwargs)
56 counters["aot_autograd"]["ok"] += 1
57 return disable(cg)
/usr/local/lib/python3.10/dist-packages/torch/_functorch/aot_autograd.py in aot_module_simplified(mod, args, fw_compiler, bw_compiler, partition_fn, decompositions, keep_inference_input_mutations, inference_compiler)
3889
3890 with compiled_autograd.disable():
-> 3891 compiled_fn = create_aot_dispatcher_function(
3892 functional_call,
3893 full_args,
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/utils.py in time_wrapper(*args, **kwargs)
187 with torch.profiler.record_function(f"{key} (dynamo_timed)"):
188 t0 = time.time()
--> 189 r = func(*args, **kwargs)
190 time_spent = time.time() - t0
191 compilation_time_metrics[key].append(time_spent)
/usr/local/lib/python3.10/dist-packages/torch/_functorch/aot_autograd.py in create_aot_dispatcher_function(flat_fn, flat_args, aot_config)
3427 # You can put more passes here
3428
-> 3429 compiled_fn = compiler_fn(flat_fn, fake_flat_args, aot_config, fw_metadata=fw_metadata)
3430 if aot_config.is_export:
3431
/usr/local/lib/python3.10/dist-packages/torch/_functorch/aot_autograd.py in aot_wrapper_dedupe(flat_fn, flat_args, aot_config, compiler_fn, fw_metadata)
2210
2211 if ok:
-> 2212 return compiler_fn(flat_fn, leaf_flat_args, aot_config, fw_metadata=fw_metadata)
2213
2214 # export path: ban duplicate inputs for now, add later if requested.
/usr/local/lib/python3.10/dist-packages/torch/_functorch/aot_autograd.py in aot_wrapper_synthetic_base(flat_fn, flat_args, aot_config, fw_metadata, needs_autograd, compiler_fn)
2390 # Happy path: we don't need synthetic bases
2391 if synthetic_base_info is None:
-> 2392 return compiler_fn(flat_fn, flat_args, aot_config, fw_metadata=fw_metadata)
2393
2394 # export path: ban synthetic bases for now, add later if requested.
/usr/local/lib/python3.10/dist-packages/torch/_functorch/aot_autograd.py in aot_dispatch_autograd(flat_fn, flat_args, aot_config, fw_metadata)
2915
2916 with TracingContext.report_output_strides() as fwd_output_strides:
-> 2917 compiled_fw_func = aot_config.fw_compiler(
2918 fw_module, adjusted_flat_args
2919 )
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/utils.py in time_wrapper(*args, **kwargs)
187 with torch.profiler.record_function(f"{key} (dynamo_timed)"):
188 t0 = time.time()
--> 189 r = func(*args, **kwargs)
190 time_spent = time.time() - t0
191 compilation_time_metrics[key].append(time_spent)
/usr/local/lib/python3.10/dist-packages/torch/_inductor/compile_fx.py in fw_compiler_base(model, example_inputs, is_inference)
1090 }
1091
-> 1092 return inner_compile(
1093 model,
1094 example_inputs,
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/repro/after_aot.py in debug_wrapper(gm, example_inputs, **kwargs)
78 # Call the compiler_fn - which is either aot_autograd or inductor
79 # with fake inputs
---> 80 inner_compiled_fn = compiler_fn(gm, example_inputs)
81 except Exception as e:
82 # TODO: Failures here are troublesome because no real inputs,
/usr/local/lib/python3.10/dist-packages/torch/_inductor/debug.py in inner(*args, **kwargs)
226 def inner(*args, **kwargs):
227 with DebugContext():
--> 228 return fn(*args, **kwargs)
229
230 return wrap_compiler_debug(inner, compiler_name="inductor")
/usr/lib/python3.10/contextlib.py in inner(*args, **kwds)
77 def inner(*args, **kwds):
78 with self._recreate_cm():
---> 79 return func(*args, **kwds)
80 return inner
81
/usr/local/lib/python3.10/dist-packages/torch/_inductor/compile_fx.py in newFunction(*args, **kwargs)
52 @wraps(old_func)
53 def newFunction(*args, **kwargs):
---> 54 return old_func(*args, **kwargs)
55
56 return newFunction
/usr/local/lib/python3.10/dist-packages/torch/_inductor/compile_fx.py in compile_fx_inner(gm, example_inputs, cudagraphs, num_fixed, is_backward, graph_id, cpp_wrapper, aot_mode, is_inference, boxed_forward_device_index, user_visible_outputs, layout_opt)
339 }
340
--> 341 compiled_graph: CompiledFxGraph = fx_codegen_and_compile(
342 *graph_args, **graph_kwargs # type: ignore[arg-type]
343 )
/usr/local/lib/python3.10/dist-packages/torch/_inductor/compile_fx.py in fx_codegen_and_compile(gm, example_inputs, cudagraphs, num_fixed, is_backward, graph_id, cpp_wrapper, aot_mode, is_inference, user_visible_outputs, layout_opt)
563 else:
564 context.output_strides.append(None)
--> 565 compiled_fn = graph.compile_to_fn()
566
567 if graph.disable_cudagraphs:
/usr/local/lib/python3.10/dist-packages/torch/_inductor/graph.py in compile_to_fn(self)
968 return AotCodeCache.compile(self, code, cuda=self.cuda)
969 else:
--> 970 return self.compile_to_module().call
971
972 def get_output_names(self):
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/utils.py in time_wrapper(*args, **kwargs)
187 with torch.profiler.record_function(f"{key} (dynamo_timed)"):
188 t0 = time.time()
--> 189 r = func(*args, **kwargs)
190 time_spent = time.time() - t0
191 compilation_time_metrics[key].append(time_spent)
/usr/local/lib/python3.10/dist-packages/torch/_inductor/graph.py in compile_to_module(self)
939 linemap = [(line_no, node.stack_trace) for line_no, node in linemap]
940 key, path = PyCodeCache.write(code)
--> 941 mod = PyCodeCache.load_by_key_path(key, path, linemap=linemap)
942 self.cache_key = key
943 self.cache_path = path
/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py in load_by_key_path(cls, key, path, linemap)
1137 mod.__file__ = path
1138 mod.key = key
-> 1139 exec(code, mod.__dict__, mod.__dict__)
1140 sys.modules[mod.__name__] = mod
1141 # another thread might set this first
/tmp/torchinductor_root/36/c36twgeyj3id4letfelt2vbacz7abhe244te45y6bykwh6ycmxke.py in <module>
50
51
---> 52 async_compile.wait(globals())
53 del async_compile
54
/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py in wait(self, scope)
1416 pbar.set_postfix_str(key)
1417 if isinstance(result, (Future, TritonFuture)):
-> 1418 scope[key] = result.result()
1419 pbar.update(1)
1420
/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py in result(self)
1275 return self.kernel
1276 # If the worker failed this will throw an exception.
-> 1277 self.future.result()
1278 kernel = self.kernel = _load_kernel(self.kernel_name, self.source_code)
1279 latency = time() - t0
/usr/lib/python3.10/concurrent/futures/_base.py in result(self, timeout)
456 raise CancelledError()
457 elif self._state == FINISHED:
--> 458 return self.__get_result()
459 else:
460 raise TimeoutError()
/usr/lib/python3.10/concurrent/futures/_base.py in __get_result(self)
401 if self._exception:
402 try:
--> 403 raise self._exception
404 finally:
405 # Break a reference cycle with the exception in self._exception
BackendCompilerFailed: backend='inductor' raised:
AssertionError: libcuda.so cannot found!
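The assertion is raised while Inductor's worker compiles its generated Triton kernels: the build step cannot locate the CUDA driver library libcuda.so at link time, so compilation aborts even though the GPU works fine in eager mode. On Colab the driver libraries live in /usr/lib64-nvidia, which is not always in the dynamic linker cache. A commonly reported workaround, sketched here under the assumption of that Colab layout (adjust the path on other systems):

import os
import subprocess

# Refresh the dynamic linker cache so Triton's compile step can resolve libcuda.so.
# /usr/lib64-nvidia is where Colab places the NVIDIA driver libraries; Colab cells run as root.
subprocess.run(["ldconfig", "/usr/lib64-nvidia"], check=True)

# Alternatively, recent Triton versions honor this variable to locate the driver directly.
os.environ["TRITON_LIBCUDA_PATH"] = "/usr/lib64-nvidia"

After running either fix in a fresh cell (restart the runtime if the failed compile was already cached), retry torch.compile. If the path differs on your system, locate the library first, for example with subprocess.run(["find", "/usr", "-name", "libcuda.so*"]).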