Created
June 28, 2024 00:32
-
-
Save leslie-fang-intel/193bb1ec096e619ff441484f94a0e2a3 to your computer and use it in GitHub Desktop.
trace log for 128513
This file has been truncated, but you can view the full file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
V0627 17:31:00.663000 139845268738432 torch/_logging/structured.py:19] {"str": ["/localdisk/leslie/torch_inductor_community/pytorch/benchmarks/dynamo/torchbench.py", 0]} | |
V0627 17:31:00.663000 139845268738432 torch/_logging/structured.py:19] {"str": ["/localdisk/leslie/torch_inductor_community/pytorch/benchmarks/dynamo/common.py", 1]} | |
V0627 17:31:00.663000 139845268738432 torch/_logging/structured.py:19] {"str": ["/localdisk/leslie/torch_inductor_community/pytorch/torch/_dynamo/eval_frame.py", 2]} | |
V0627 17:31:00.663000 139845268738432 torch/_logging/structured.py:19] {"str": ["/localdisk/leslie/torch_inductor_community/pytorch/torch/_dynamo/convert_frame.py", 3]} | |
V0627 17:31:00.663000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:00.691000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 0, "size": 6552}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:00.692000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 819], "is_leaf": true, "stride": [819, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2ef4724e50>", "describer_id": 0}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:00.692000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 0, "id": 0, "source": "L['inputs'][0]"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:00.701000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 0, "size": 32768}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:00.701000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 4096], "is_leaf": true, "stride": [4096, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2eb1d44450>", "describer_id": 0}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:00.701000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 0, "id": 1, "source": "L['mod'].bert.embeddings.token_type_ids"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:00.718000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {}}, "frame_id": 0, "frame_compile_id": 0, "attempt": 1, "has_payload": "8f3f91fb1d48d67b1336de49ea694c74"} | |
class GraphModule(torch.nn.Module): | |
def forward(self): | |
# No stacktrace found for following nodes | |
_enter_autocast = torch.amp.autocast_mode._enter_autocast('cpu', None, True, None) | |
_exit_autocast = torch.amp.autocast_mode._exit_autocast(_enter_autocast); _enter_autocast = None | |
return () | |
V0627 17:31:01.398000 139845268738432 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:199] {"aot_forward_graph": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 1, "has_payload": "845c30ca0008a08ec62276cecc47183b"} | |
class <lambda>(torch.nn.Module): | |
def forward(self): | |
return () | |
V0627 17:31:01.498000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"} | |
[ | |
] | |
V0627 17:31:01.498000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 1, "has_payload": "40c07a4da7b433b5416cc93985646719"} | |
TREE_GUARD_MANAGER: | |
+- RootGuardManager | |
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards | |
| +- GLOBAL_STATE: ___check_global_state() | |
| +- GuardManager: source=L['mod'], accessed_by=DictGetItemGuardAccessor(mod) | |
| | +- ID_MATCH: ___check_obj_id(L['mod'], 139839714901824) | |
| | +- GuardManager: source=L['mod'].__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | +- GuardManager: source=L['mod'].training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | +- ID_MATCH: ___check_obj_id(L['mod'].training, 7685824) | |
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self) | |
| | +- TYPE_MATCH: ___check_type_id(L['self'], 139842378438672) | |
| | +- GuardManager: source=L['self'].autocast, accessed_by=GetAttrGuardAccessor(autocast) | |
| | | +- TYPE_MATCH: ___check_type_id(L['self'].autocast, 139845255007760) | |
| | | +- GuardManager: source=L['self'].autocast.args, accessed_by=GetAttrGuardAccessor(args) | |
| | | | +- TYPE_MATCH: ___check_type_id(L['self'].autocast.args, 7625984) | |
| | | | +- LENGTH_CHECK: not L['self'].autocast.args | |
| | | +- GuardManager: source=L['self'].autocast.func, accessed_by=GetAttrGuardAccessor(func) | |
| | | | +- ID_MATCH: ___check_obj_id(L['self'].autocast.func, 139844826956816) | |
| | | +- GuardManager: source=L['self'].autocast.keywords, accessed_by=GetAttrGuardAccessor(keywords) | |
| | | | +- TYPE_MATCH: ___check_type_id(L['self'].autocast.keywords, 7646656) | |
| | | | +- GuardManager: source=L['self'].autocast.keywords['device_type'], accessed_by=DictGetItemGuardAccessor(device_type) | |
| | | | | +- EQUALS_MATCH: L['self'].autocast.keywords['device_type'] == 'cpu' | |
| | +- GuardManager: source=L['self'].autocast_arg, accessed_by=GetAttrGuardAccessor(autocast_arg) | |
| | | +- TYPE_MATCH: ___check_type_id(L['self'].autocast_arg, 7646656) | |
| | | +- DICT_LENGTH: not L['self'].autocast_arg | |
| +- GuardManager: source=L['inputs'], accessed_by=DictGetItemGuardAccessor(inputs) | |
| | +- TYPE_MATCH: ___check_type_id(L['inputs'], 7625984) | |
| | +- LENGTH_CHECK: len(L['inputs']) == 1 | |
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor | |
| | +- GuardManager: source=G['__builtins_dict___1'], accessed_by=DictGetItemGuardAccessor(__builtins_dict___1) | |
| | | +- GuardManager: source=G['__builtins_dict___1']['dict'], accessed_by=DictGetItemGuardAccessor(dict) | |
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___1']['dict'], 7646656) | |
| | | +- GuardManager: source=G['__builtins_dict___1']['isinstance'], accessed_by=DictGetItemGuardAccessor(isinstance) | |
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___1']['isinstance'], 139845257826512) | |
V0627 17:31:01.498000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "0/0", "frame_key": "1", "co_name": "forward_pass", "co_filename": "/localdisk/leslie/torch_inductor_community/pytorch/benchmarks/dynamo/torchbench.py", "co_firstlineno": 425, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 19, "shape_env_guard_count": 0, "graph_op_count": 2, "graph_node_count": 3, "graph_input_count": 0, "start_time": 1719534660.6636841, "entire_frame_compile_time_s": 0.8347411155700684, "backend_compile_time_s": 0.7748816013336182, "inductor_compile_time_s": 0.00018596649169921875, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Logger not supported for non-export cases"], "dynamo_time_before_restart_s": 0.04396843910217285, "has_guarded_code": true}, "frame_id": 0, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:01.500000 139845268738432 torch/_logging/structured.py:19] {"str": ["/localdisk/leslie/torch_inductor_community/pytorch/torch/nn/modules/module.py", 4]} | |
V0627 17:31:01.500000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:01.513000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 6, "size": 6552}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:01.513000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 819], "is_leaf": true, "stride": [819, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2ef4724e50>", "describer_id": 6}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:01.514000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 6, "id": 0, "source": "L['input_ids']"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:01.519000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 6, "size": 32768}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:01.519000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 4096], "is_leaf": true, "stride": [4096, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2eb1d44450>", "describer_id": 6}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:01.519000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 6, "id": 1, "source": "L['self'].bert.embeddings.token_type_ids"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:01.530000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 7, "size": 6552}, "frame_id": 1, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:01.530000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 819], "is_leaf": true, "stride": [819, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2ef4724e50>", "describer_id": 7}, "frame_id": 1, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:01.530000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 7, "id": 0, "source": "L['input_ids']"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:01.535000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"} | |
[ | |
] | |
V0627 17:31:01.535000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 1, "has_payload": "6017f86a7c776c49ca1dd7d3539605bb"} | |
TREE_GUARD_MANAGER: | |
+- RootGuardManager | |
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards | |
| +- GLOBAL_STATE: ___check_global_state() | |
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self) | |
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839714901824) | |
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824) | |
| | | +- GuardManager: source=L['self']._modules, accessed_by=DictGetItemGuardAccessor(_modules) | |
| | | | +- GuardManager: source=L['self'].bert, accessed_by=DictGetItemGuardAccessor(bert) | |
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].bert, 139839714901584) | |
| | | | | +- GuardManager: source=L['self'].bert.__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | | | | +- GuardManager: source=L['self'].bert.training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].bert.training, 7685824) | |
| | | +- GuardManager: source=L['self'].config, accessed_by=DictGetItemGuardAccessor(config) | |
| | | | +- TYPE_MATCH: ___check_type_id(L['self'].config, 139839810624528) | |
| +- GuardManager: source=L['head_mask'], accessed_by=DictGetItemGuardAccessor(head_mask) | |
| | +- ID_MATCH: ___check_obj_id(L['head_mask'], 7636800) | |
| +- GuardManager: source=L['input_ids'], accessed_by=DictGetItemGuardAccessor(input_ids) | |
| | +- TENSOR_MATCH: check_tensor(L['input_ids'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int64, device=None, requires_grad=False, size=[1, 819], stride=[819, 1]) | |
| | +- NO_HASATTR: hasattr(L['input_ids'], '_dynamo_dynamic_indices') == False | |
| +- GuardManager: source=L['return_dict'], accessed_by=DictGetItemGuardAccessor(return_dict) | |
| | +- ID_MATCH: ___check_obj_id(L['return_dict'], 7636800) | |
| +- GuardManager: source=L['position_ids'], accessed_by=DictGetItemGuardAccessor(position_ids) | |
| | +- ID_MATCH: ___check_obj_id(L['position_ids'], 7636800) | |
| +- GuardManager: source=L['inputs_embeds'], accessed_by=DictGetItemGuardAccessor(inputs_embeds) | |
| | +- ID_MATCH: ___check_obj_id(L['inputs_embeds'], 7636800) | |
| +- GuardManager: source=L['attention_mask'], accessed_by=DictGetItemGuardAccessor(attention_mask) | |
| | +- ID_MATCH: ___check_obj_id(L['attention_mask'], 7636800) | |
| +- GuardManager: source=L['token_type_ids'], accessed_by=DictGetItemGuardAccessor(token_type_ids) | |
| | +- ID_MATCH: ___check_obj_id(L['token_type_ids'], 7636800) | |
| +- GuardManager: source=L['output_attentions'], accessed_by=DictGetItemGuardAccessor(output_attentions) | |
| | +- ID_MATCH: ___check_obj_id(L['output_attentions'], 7636800) | |
| +- GuardManager: source=L['output_hidden_states'], accessed_by=DictGetItemGuardAccessor(output_hidden_states) | |
| | +- ID_MATCH: ___check_obj_id(L['output_hidden_states'], 7636800) | |
| +- GuardManager: source=L['encoder_hidden_states'], accessed_by=DictGetItemGuardAccessor(encoder_hidden_states) | |
| | +- ID_MATCH: ___check_obj_id(L['encoder_hidden_states'], 7636800) | |
| +- GuardManager: source=L['encoder_attention_mask'], accessed_by=DictGetItemGuardAccessor(encoder_attention_mask) | |
| | +- ID_MATCH: ___check_obj_id(L['encoder_attention_mask'], 7636800) | |
V0627 17:31:01.535000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "1/0", "frame_key": "6", "co_name": "forward", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 2382, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 19, "shape_env_guard_count": 0, "graph_op_count": 0, "graph_node_count": 1, "graph_input_count": 1, "start_time": 1719534661.5002189, "entire_frame_compile_time_s": 0.03560638427734375, "backend_compile_time_s": null, "inductor_compile_time_s": null, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Logger not supported for non-export cases"], "dynamo_time_before_restart_s": 0.022696733474731445, "has_guarded_code": true}, "frame_id": 1, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:01.536000 139845268738432 torch/_logging/structured.py:19] {"str": ["/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", 5]} | |
V0627 17:31:01.536000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 2, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:01.542000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 8, "size": 6552}, "frame_id": 2, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:01.542000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 819], "is_leaf": true, "stride": [819, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2ef4724e50>", "describer_id": 8}, "frame_id": 2, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:01.542000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 8, "id": 0, "source": "L['input_ids']"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:01.548000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 8, "size": 32768}, "frame_id": 2, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:01.548000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 4096], "is_leaf": true, "stride": [4096, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2eb1d44450>", "describer_id": 8}, "frame_id": 2, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:01.548000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 8, "id": 1, "source": "L['self'].embeddings.token_type_ids"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:01.557000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 9, "size": 6552}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:01.557000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 819], "is_leaf": true, "stride": [819, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2ef4724e50>", "describer_id": 9}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:01.557000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 9, "id": 0, "source": "L['input_ids']"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:01.562000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 9, "size": 32768}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:01.563000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 4096], "is_leaf": true, "stride": [4096, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2eb1d44450>", "describer_id": 9}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:01.563000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 9, "id": 1, "source": "L['self'].embeddings.token_type_ids"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:01.566000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {"attention_mask": [1, 819], "l__self___embeddings_token_type_ids": [1, 4096], "buffered_token_type_ids": [1, 819], "buffered_token_type_ids_expanded": [1, 819]}}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "6b7bec0701d22225fb67e6f1bfb9dc36"} | |
class GraphModule(torch.nn.Module): | |
def forward(self): | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2039 in forward, code: attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) | |
attention_mask: "f32[1, 819][819, 1]cpu" = torch.ones((1, 819), device = device(type='cpu')) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2042 in forward, code: buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] | |
l__self___embeddings_token_type_ids: "i64[1, 4096][4096, 1]cpu" = self.L__self___embeddings_token_type_ids | |
buffered_token_type_ids: "i64[1, 819][4096, 1]cpu" = l__self___embeddings_token_type_ids[(slice(None, None, None), slice(None, 819, None))]; l__self___embeddings_token_type_ids = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2043 in forward, code: buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) | |
buffered_token_type_ids_expanded: "i64[1, 819][4096, 1]cpu" = buffered_token_type_ids.expand(1, 819); buffered_token_type_ids = None | |
return (attention_mask, buffered_token_type_ids_expanded) | |
V0627 17:31:01.581000 139845268738432 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:199] {"aot_forward_graph": {}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "1b3fb2899c356f991117f2262727f0ef"} | |
class <lambda>(torch.nn.Module): | |
def forward(self, arg0_1: "i64[1, 4096][4096, 1]cpu"): | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2039 in forward, code: attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) | |
full: "f32[1, 819][819, 1]cpu" = torch.ops.aten.full.default([1, 819], 1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2042 in forward, code: buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] | |
slice_1: "i64[1, 4096][4096, 1]cpu" = torch.ops.aten.slice.Tensor(arg0_1, 0, 0, 9223372036854775807); arg0_1 = None | |
slice_2: "i64[1, 819][4096, 1]cpu" = torch.ops.aten.slice.Tensor(slice_1, 1, 0, 819); slice_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2043 in forward, code: buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) | |
expand: "i64[1, 819][4096, 1]cpu" = torch.ops.aten.expand.default(slice_2, [1, 819]); slice_2 = None | |
return (full, expand) | |
V0627 17:31:01.707000 139845268738432 torch/_inductor/compile_fx.py:754] {"inductor_post_grad_graph": {}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "4a69dc4d0dfb43287c6abf210e06617e"} | |
class <lambda>(torch.nn.Module): | |
def forward(self, arg0_1: "i64[1, 4096][4096, 1]cpu"): | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2039 in forward, code: attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) | |
full_default: "f32[1, 819][819, 1]cpu" = torch.ops.aten.full.default([1, 819], 1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2042 in forward, code: buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] | |
slice_2: "i64[1, 819][4096, 1]cpu" = torch.ops.aten.slice.Tensor(arg0_1, 1, 0, 819); arg0_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2043 in forward, code: buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) | |
expand: "i64[1, 819][4096, 1]cpu" = torch.ops.aten.expand.default(slice_2, [1, 819]); slice_2 = None | |
return (full_default, expand) | |
V0627 17:31:02.787000 139845268738432 torch/_inductor/graph.py:1693] {"inductor_output_code": {"filename": "/tmp/torchinductor_leslie/ko/ckovbmwilyxv4sl5d74oe6jsmj25rub5gzv5kco7u66lavi64ivy.py"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "0244b4281966e5f52ba168279eb45118"} | |
# AOT ID: ['1_inference'] | |
from ctypes import c_void_p, c_long | |
import torch | |
import math | |
import random | |
import os | |
import tempfile | |
from math import inf, nan | |
from torch._inductor.hooks import run_intermediate_hooks | |
from torch._inductor.utils import maybe_profile | |
from torch._inductor.codegen.memory_planning import _align as align | |
from torch import device, empty_strided | |
from torch._inductor.async_compile import AsyncCompile | |
from torch._inductor.select_algorithm import extern_kernels | |
from torch._inductor.codegen.multi_kernel import MultiKernelCall | |
aten = torch.ops.aten | |
inductor_ops = torch.ops.inductor | |
_quantized = torch.ops._quantized | |
assert_size_stride = torch._C._dynamo.guards.assert_size_stride | |
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu | |
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda | |
alloc_from_pool = torch.ops.inductor._alloc_from_pool | |
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor | |
async_compile = AsyncCompile() | |
cpp_fused_ones_0 = async_compile.cpp_pybinding(['float*'], ''' | |
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h" | |
extern "C" void kernel(float* out_ptr0) | |
{ | |
{ | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(816L); x0+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = static_cast<float>(1.0); | |
auto tmp1 = at::vec::Vectorized<float>(tmp0); | |
tmp1.store(out_ptr0 + static_cast<long>(x0)); | |
} | |
#pragma omp simd simdlen(8) | |
for(long x0=static_cast<long>(816L); x0<static_cast<long>(819L); x0+=static_cast<long>(1L)) | |
{ | |
auto tmp0 = static_cast<float>(1.0); | |
out_ptr0[static_cast<long>(x0)] = tmp0; | |
} | |
} | |
} | |
''') | |
async_compile.wait(globals()) | |
del async_compile | |
def call(args): | |
arg0_1, = args | |
args.clear() | |
assert_size_stride(arg0_1, (1, 4096), (4096, 1)) | |
buf0 = empty_strided_cpu((1, 819), (819, 1), torch.float32) | |
cpp_fused_ones_0(buf0) | |
return (buf0, reinterpret_tensor(arg0_1, (1, 819), (4096, 1), 0), ) | |
def benchmark_compiled_module(times=10, repeat=10): | |
from torch._dynamo.testing import rand_strided | |
from torch._inductor.utils import print_performance | |
arg0_1 = rand_strided((1, 4096), (4096, 1), device='cpu', dtype=torch.int64) | |
fn = lambda: call([arg0_1]) | |
return print_performance(fn, times=times, repeat=repeat) | |
if __name__ == "__main__": | |
from torch._inductor.wrapper_benchmark import compiled_module_main | |
compiled_module_main('hf_BigBird', benchmark_compiled_module) | |
V0627 17:31:02.814000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"} | |
[ | |
] | |
V0627 17:31:02.815000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "0b1c2f71c2e67149726041714c77db6e"} | |
TREE_GUARD_MANAGER: | |
+- RootGuardManager | |
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards | |
| +- GLOBAL_STATE: ___check_global_state() | |
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self) | |
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839714901584) | |
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824) | |
| | | +- GuardManager: source=L['self'].config, accessed_by=DictGetItemGuardAccessor(config) | |
| | | | +- TYPE_MATCH: ___check_type_id(L['self'].config, 139839810624528) | |
| | | +- GuardManager: source=L['self']._modules, accessed_by=DictGetItemGuardAccessor(_modules) | |
| | | | +- GuardManager: source=L['self'].embeddings, accessed_by=DictGetItemGuardAccessor(embeddings) | |
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings, 139839713378208) | |
| | | | | +- GuardManager: source=L['self'].embeddings.__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | | | | +- GuardManager: source=L['self'].embeddings.training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.training, 7685824) | |
| | | | | | +- GuardManager: source=L['self'].embeddings._buffers, accessed_by=DictGetItemGuardAccessor(_buffers) | |
| | | | | | | +- GuardManager: source=L['self'].embeddings.token_type_ids, accessed_by=DictGetItemGuardAccessor(token_type_ids) | |
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.token_type_ids, 139838528701520) | |
| | | +- GuardManager: source=L['self'].attention_type, accessed_by=DictGetItemGuardAccessor(attention_type) | |
| | | | +- EQUALS_MATCH: L['self'].attention_type == 'block_sparse' | |
| +- GuardManager: source=L['input_ids'], accessed_by=DictGetItemGuardAccessor(input_ids) | |
| | +- TENSOR_MATCH: check_tensor(L['input_ids'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int64, device=None, requires_grad=False, size=[1, 819], stride=[819, 1]) | |
| | +- NO_HASATTR: hasattr(L['input_ids'], '_dynamo_dynamic_indices') == False | |
| +- GuardManager: source=L['return_dict'], accessed_by=DictGetItemGuardAccessor(return_dict) | |
| | +- ID_MATCH: ___check_obj_id(L['return_dict'], 7685856) | |
| +- GuardManager: source=L['position_ids'], accessed_by=DictGetItemGuardAccessor(position_ids) | |
| | +- ID_MATCH: ___check_obj_id(L['position_ids'], 7636800) | |
| +- GuardManager: source=L['inputs_embeds'], accessed_by=DictGetItemGuardAccessor(inputs_embeds) | |
| | +- ID_MATCH: ___check_obj_id(L['inputs_embeds'], 7636800) | |
| +- GuardManager: source=L['attention_mask'], accessed_by=DictGetItemGuardAccessor(attention_mask) | |
| | +- ID_MATCH: ___check_obj_id(L['attention_mask'], 7636800) | |
| +- GuardManager: source=L['token_type_ids'], accessed_by=DictGetItemGuardAccessor(token_type_ids) | |
| | +- ID_MATCH: ___check_obj_id(L['token_type_ids'], 7636800) | |
| +- GuardManager: source=L['past_key_values'], accessed_by=DictGetItemGuardAccessor(past_key_values) | |
| | +- ID_MATCH: ___check_obj_id(L['past_key_values'], 7636800) | |
| +- GuardManager: source=L['output_attentions'], accessed_by=DictGetItemGuardAccessor(output_attentions) | |
| | +- ID_MATCH: ___check_obj_id(L['output_attentions'], 7636800) | |
| +- GuardManager: source=L['output_hidden_states'], accessed_by=DictGetItemGuardAccessor(output_hidden_states) | |
| | +- ID_MATCH: ___check_obj_id(L['output_hidden_states'], 7636800) | |
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor | |
| | +- GuardManager: source=G['torch'], accessed_by=DictGetItemGuardAccessor(torch) | |
| | | +- ID_MATCH: ___check_obj_id(G['torch'], 139845236322800) | |
| | | +- GuardManager: source=G['torch'].ones, accessed_by=GetAttrGuardAccessor(ones) | |
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].ones, 139845228734288) | |
| | +- GuardManager: source=G['__import_torch'], accessed_by=DictGetItemGuardAccessor(__import_torch) | |
| | | +- GuardManager: source=G['__import_torch'].fx, accessed_by=GetAttrGuardAccessor(fx) | |
| | | | +- ID_MATCH: ___check_obj_id(G['__import_torch'].fx, 139842407409488) | |
| | | | +- GuardManager: source=G['__import_torch'].fx.Proxy, accessed_by=GetAttrGuardAccessor(Proxy) | |
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch'].fx.Proxy, 139842429035536) | |
| | | +- GuardManager: source=G['__import_torch']._dynamo, accessed_by=GetAttrGuardAccessor(_dynamo) | |
| | | | +- ID_MATCH: ___check_obj_id(G['__import_torch']._dynamo, 139839776121264) | |
| | | | +- GuardManager: source=G['__import_torch']._dynamo.is_compiling, accessed_by=GetAttrGuardAccessor(is_compiling) | |
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch']._dynamo.is_compiling, 139839726529856) | |
| | +- GuardManager: source=G['__builtins_dict___9'], accessed_by=DictGetItemGuardAccessor(__builtins_dict___9) | |
| | | +- GuardManager: source=G['__builtins_dict___9']['hasattr'], accessed_by=DictGetItemGuardAccessor(hasattr) | |
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___9']['hasattr'], 139845257826112) | |
| | | +- GuardManager: source=G['__builtins_dict___9']['isinstance'], accessed_by=DictGetItemGuardAccessor(isinstance) | |
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___9']['isinstance'], 139845257826512) | |
| | +- GuardManager: source=G['__import_transformers_dot_modeling_utils'], accessed_by=DictGetItemGuardAccessor(__import_transformers_dot_modeling_utils) | |
| | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_modeling_utils'], 139839661201088) | |
| | | +- GuardManager: source=G['__import_transformers_dot_modeling_utils'].torch, accessed_by=GetAttrGuardAccessor(torch) | |
| | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_modeling_utils'].torch, 139845236322800) | |
| | | | +- GuardManager: source=G['__import_transformers_dot_modeling_utils'].torch.jit, accessed_by=GetAttrGuardAccessor(jit) | |
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_modeling_utils'].torch.jit, 139842414949968) | |
| | | | | +- GuardManager: source=G['__import_transformers_dot_modeling_utils'].torch.jit.is_tracing, accessed_by=GetAttrGuardAccessor(is_tracing) | |
| | | | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_modeling_utils'].torch.jit.is_tracing, 139842413687088) | |
| | | +- GuardManager: source=G['__import_transformers_dot_modeling_utils'].is_torch_fx_proxy, accessed_by=GetAttrGuardAccessor(is_torch_fx_proxy) | |
| | | | +- GuardManager: source=G['__import_transformers_dot_modeling_utils'].is_torch_fx_proxy.__code__, accessed_by=GetAttrGuardAccessor(__code__) | |
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_modeling_utils'].is_torch_fx_proxy.__code__, 139839683265264) | |
| | | +- GuardManager: source=G['__import_transformers_dot_modeling_utils'].is_torchdynamo_compiling, accessed_by=GetAttrGuardAccessor(is_torchdynamo_compiling) | |
| | | | +- GuardManager: source=G['__import_transformers_dot_modeling_utils'].is_torchdynamo_compiling.__code__, accessed_by=GetAttrGuardAccessor(__code__) | |
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_modeling_utils'].is_torchdynamo_compiling.__code__, 139839683236192) | |
| | +- GuardManager: source=G['__import_transformers_dot_utils_dot_import_utils'], accessed_by=DictGetItemGuardAccessor(__import_transformers_dot_utils_dot_import_utils) | |
| | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_utils_dot_import_utils'], 139839683217824) | |
| | | +- GuardManager: source=G['__import_transformers_dot_utils_dot_import_utils']._torch_available, accessed_by=GetAttrGuardAccessor(_torch_available) | |
| | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_utils_dot_import_utils']._torch_available, 7685856) | |
| | | +- GuardManager: source=G['__import_transformers_dot_utils_dot_import_utils'].is_torch_available, accessed_by=GetAttrGuardAccessor(is_torch_available) | |
| | | | +- GuardManager: source=G['__import_transformers_dot_utils_dot_import_utils'].is_torch_available.__code__, accessed_by=GetAttrGuardAccessor(__code__) | |
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_utils_dot_import_utils'].is_torch_available.__code__, 139839683197424) | |
| | | +- GuardManager: source=G['__import_transformers_dot_utils_dot_import_utils']._torch_fx_available, accessed_by=GetAttrGuardAccessor(_torch_fx_available) | |
| | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_utils_dot_import_utils']._torch_fx_available, 7685856) | |
| | | +- GuardManager: source=G['__import_transformers_dot_utils_dot_import_utils'].is_torch_fx_available, accessed_by=GetAttrGuardAccessor(is_torch_fx_available) | |
| | | | +- GuardManager: source=G['__import_transformers_dot_utils_dot_import_utils'].is_torch_fx_available.__code__, accessed_by=GetAttrGuardAccessor(__code__) | |
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_utils_dot_import_utils'].is_torch_fx_available.__code__, 139839683233376) | |
V0627 17:31:02.815000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "2/0", "frame_key": "7", "co_name": "forward", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 1970, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 39, "shape_env_guard_count": 0, "graph_op_count": 3, "graph_node_count": 5, "graph_input_count": 0, "start_time": 1719534661.536575, "entire_frame_compile_time_s": 1.2790420055389404, "backend_compile_time_s": 1.2300312519073486, "inductor_compile_time_s": 1.2066993713378906, "code_gen_time_s": 1.083174467086792, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Logger not supported for non-export cases"], "dynamo_time_before_restart_s": 0.014907121658325195, "has_guarded_code": true}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:02.816000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 3, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:02.823000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 12, "size": 6552}, "frame_id": 3, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:02.823000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 819], "is_leaf": true, "stride": [819, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2ef4724e50>", "describer_id": 12}, "frame_id": 3, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:02.823000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 12, "id": 0, "source": "L['input_ids']"}, "frame_id": 3, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:02.830000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 13, "size": 6552}, "frame_id": 3, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:02.830000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 819], "is_leaf": true, "stride": [819, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2ef4724e50>", "describer_id": 13}, "frame_id": 3, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:02.830000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 13, "id": 0, "source": "L['input_ids']"}, "frame_id": 3, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:02.837000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 3, "frame_compile_id": 0, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"} | |
[ | |
] | |
V0627 17:31:02.837000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 3, "frame_compile_id": 0, "attempt": 1, "has_payload": "fce4fac5f9230c475246dd6dd52e1c05"} | |
TREE_GUARD_MANAGER: | |
+- RootGuardManager | |
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards | |
| +- GLOBAL_STATE: ___check_global_state() | |
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self) | |
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839714901584) | |
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824) | |
| | | +- GuardManager: source=L['self'].config, accessed_by=DictGetItemGuardAccessor(config) | |
| | | | +- TYPE_MATCH: ___check_type_id(L['self'].config, 139839810624528) | |
| +- GuardManager: source=L['input_ids'], accessed_by=DictGetItemGuardAccessor(input_ids) | |
| | +- TENSOR_MATCH: check_tensor(L['input_ids'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int64, device=None, requires_grad=False, size=[1, 819], stride=[819, 1]) | |
| | +- NO_HASATTR: hasattr(L['input_ids'], '_dynamo_dynamic_indices') == False | |
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor | |
| | +- GuardManager: source=G['logger'], accessed_by=DictGetItemGuardAccessor(logger) | |
| | | +- ID_MATCH: ___check_obj_id(G['logger'], 139839664782448) | |
V0627 17:31:02.837000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "3/0", "frame_key": "8", "co_name": "_pad_to_block_size", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 2208, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 9, "shape_env_guard_count": 0, "graph_op_count": 0, "graph_node_count": 1, "graph_input_count": 1, "start_time": 1719534662.816827, "entire_frame_compile_time_s": 0.0205228328704834, "backend_compile_time_s": null, "inductor_compile_time_s": null, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Logger not supported for non-export cases"], "dynamo_time_before_restart_s": 0.007956266403198242, "has_guarded_code": true}, "frame_id": 3, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:02.838000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2226, "name": "_pad_to_block_size", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:02.839000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 14, "size": 6552}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:02.839000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 819], "is_leaf": true, "stride": [819, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2ef4724e50>", "describer_id": 14}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:02.839000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 14, "id": 0, "source": "L['input_ids']"}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:02.843000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 14, "size": 3276}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:02.844000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 2, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 819], "is_leaf": true, "stride": [819, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31f99710>", "describer_id": 14}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:02.844000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 14, "id": 2, "source": "L['attention_mask']"}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:02.846000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 14, "size": 32768}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:02.846000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 5, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 4096], "is_leaf": true, "stride": [4096, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2eb1d44450>", "describer_id": 14}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:02.846000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 4, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 819], "is_leaf": true, "is_view": true, "stride": [4096, 1], "storage": 2, "base": 5, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2ed9f15fd0>", "describer_id": 14}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:02.846000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 14, "id": 4, "source": "L['token_type_ids']"}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:02.850000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {"l_input_ids_": [1, 819], "l_attention_mask_": [1, 819], "l_token_type_ids_": [1, 819], "input_ids": [1, 832], "attention_mask": [1, 832], "token_type_ids": [1, 832]}}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "e7d86ff372082e962b35557ebd7308fc"} | |
class GraphModule(torch.nn.Module): | |
def forward(self, L_input_ids_: "i64[1, 819][819, 1]cpu", L_attention_mask_: "f32[1, 819][819, 1]cpu", L_token_type_ids_: "i64[1, 819][4096, 1]cpu"): | |
l_input_ids_ = L_input_ids_ | |
l_attention_mask_ = L_attention_mask_ | |
l_token_type_ids_ = L_token_type_ids_ | |
# File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/functional.py:4552 in pad, code: return torch._C._nn.pad(input, pad, mode, value) | |
input_ids: "i64[1, 832][832, 1]cpu" = torch._C._nn.pad(l_input_ids_, (0, 13), 'constant', 0); l_input_ids_ = None | |
# File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/functional.py:4552 in pad, code: return torch._C._nn.pad(input, pad, mode, value) | |
attention_mask: "f32[1, 832][832, 1]cpu" = torch._C._nn.pad(l_attention_mask_, (0, 13), 'constant', False); l_attention_mask_ = None | |
# File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/functional.py:4552 in pad, code: return torch._C._nn.pad(input, pad, mode, value) | |
token_type_ids: "i64[1, 832][832, 1]cpu" = torch._C._nn.pad(l_token_type_ids_, (0, 13), 'constant', 0); l_token_type_ids_ = None | |
return (input_ids, attention_mask, token_type_ids) | |
V0627 17:31:02.865000 139845268738432 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:199] {"aot_forward_graph": {}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "c1bca16543cf877223d6615932016518"} | |
class <lambda>(torch.nn.Module): | |
def forward(self, arg0_1: "i64[1, 819][819, 1]cpu", arg1_1: "f32[1, 819][819, 1]cpu", arg2_1: "i64[1, 819][4096, 1]cpu"): | |
# File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/functional.py:4552 in pad, code: return torch._C._nn.pad(input, pad, mode, value) | |
constant_pad_nd: "i64[1, 832][832, 1]cpu" = torch.ops.aten.constant_pad_nd.default(arg0_1, [0, 13], 0.0); arg0_1 = None | |
# File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/functional.py:4552 in pad, code: return torch._C._nn.pad(input, pad, mode, value) | |
constant_pad_nd_1: "f32[1, 832][832, 1]cpu" = torch.ops.aten.constant_pad_nd.default(arg1_1, [0, 13], 0.0); arg1_1 = None | |
# File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/functional.py:4552 in pad, code: return torch._C._nn.pad(input, pad, mode, value) | |
constant_pad_nd_2: "i64[1, 832][832, 1]cpu" = torch.ops.aten.constant_pad_nd.default(arg2_1, [0, 13], 0.0); arg2_1 = None | |
return (constant_pad_nd, constant_pad_nd_1, constant_pad_nd_2) | |
V0627 17:31:02.875000 139845268738432 torch/_inductor/compile_fx.py:754] {"inductor_post_grad_graph": {}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "c1bca16543cf877223d6615932016518"} | |
class <lambda>(torch.nn.Module): | |
def forward(self, arg0_1: "i64[1, 819][819, 1]cpu", arg1_1: "f32[1, 819][819, 1]cpu", arg2_1: "i64[1, 819][4096, 1]cpu"): | |
# File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/functional.py:4552 in pad, code: return torch._C._nn.pad(input, pad, mode, value) | |
constant_pad_nd: "i64[1, 832][832, 1]cpu" = torch.ops.aten.constant_pad_nd.default(arg0_1, [0, 13], 0.0); arg0_1 = None | |
# File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/functional.py:4552 in pad, code: return torch._C._nn.pad(input, pad, mode, value) | |
constant_pad_nd_1: "f32[1, 832][832, 1]cpu" = torch.ops.aten.constant_pad_nd.default(arg1_1, [0, 13], 0.0); arg1_1 = None | |
# File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/functional.py:4552 in pad, code: return torch._C._nn.pad(input, pad, mode, value) | |
constant_pad_nd_2: "i64[1, 832][832, 1]cpu" = torch.ops.aten.constant_pad_nd.default(arg2_1, [0, 13], 0.0); arg2_1 = None | |
return (constant_pad_nd, constant_pad_nd_1, constant_pad_nd_2) | |
V0627 17:31:02.904000 139845268738432 torch/_inductor/graph.py:1693] {"inductor_output_code": {"filename": "/tmp/torchinductor_leslie/fj/cfjbx53t7urpjz2ng6piwvoj4vsjiqa2xrgjcrh4vovq4w45eywv.py"}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "3758e3875a0e606fcec57aeffa852874"} | |
# AOT ID: ['2_inference'] | |
from ctypes import c_void_p, c_long | |
import torch | |
import math | |
import random | |
import os | |
import tempfile | |
from math import inf, nan | |
from torch._inductor.hooks import run_intermediate_hooks | |
from torch._inductor.utils import maybe_profile | |
from torch._inductor.codegen.memory_planning import _align as align | |
from torch import device, empty_strided | |
from torch._inductor.async_compile import AsyncCompile | |
from torch._inductor.select_algorithm import extern_kernels | |
from torch._inductor.codegen.multi_kernel import MultiKernelCall | |
aten = torch.ops.aten | |
inductor_ops = torch.ops.inductor | |
_quantized = torch.ops._quantized | |
assert_size_stride = torch._C._dynamo.guards.assert_size_stride | |
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu | |
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda | |
alloc_from_pool = torch.ops.inductor._alloc_from_pool | |
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor | |
async_compile = AsyncCompile() | |
cpp_fused_constant_pad_nd_0 = async_compile.cpp_pybinding(['const int64_t*', 'const float*', 'const int64_t*', 'int64_t*', 'float*', 'int64_t*'], ''' | |
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h" | |
extern "C" void kernel(const int64_t* in_ptr0, | |
const float* in_ptr1, | |
const int64_t* in_ptr2, | |
int64_t* out_ptr0, | |
float* out_ptr1, | |
int64_t* out_ptr2) | |
{ | |
{ | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(832L); x0+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = x0; | |
auto tmp1 = c10::convert<int32_t>(tmp0); | |
auto tmp2 = at::vec::Vectorized<int32_t>::arange(tmp1, 1); | |
auto tmp3 = static_cast<int32_t>(819); | |
auto tmp4 = at::vec::Vectorized<int32_t>(tmp3); | |
auto tmp5 = at::vec::VecMask<int32_t,1>(tmp2 < tmp4); | |
auto tmp6 = [&] | |
{ | |
auto tmp7 = tmp5.template cast<float,1>().template loadu<int64_t,2>(in_ptr0 + static_cast<long>(x0)); | |
return tmp7; | |
} | |
; | |
auto tmp10 = | |
[&] | |
{ | |
if (tmp5.all_zero()) | |
{ | |
return at::vec::VectorizedN<int64_t,2>(static_cast<int64_t>(0)); | |
} | |
else | |
{ | |
auto tmp8 = tmp6(); | |
auto tmp9 = at::vec::VectorizedN<int64_t,2>(static_cast<int64_t>(0)); | |
return decltype(tmp8)::blendv(tmp9, tmp8, tmp5.template cast<int64_t,2>()); | |
} | |
} | |
() | |
; | |
tmp10.store(out_ptr0 + static_cast<long>(x0), 16); | |
} | |
} | |
{ | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(832L); x0+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = x0; | |
auto tmp1 = c10::convert<int32_t>(tmp0); | |
auto tmp2 = at::vec::Vectorized<int32_t>::arange(tmp1, 1); | |
auto tmp3 = static_cast<int32_t>(819); | |
auto tmp4 = at::vec::Vectorized<int32_t>(tmp3); | |
auto tmp5 = at::vec::VecMask<int32_t,1>(tmp2 < tmp4); | |
auto tmp6 = [&] | |
{ | |
auto tmp7 = tmp5.template cast<float,1>().template loadu<float,1>(in_ptr1 + static_cast<long>(x0)); | |
return tmp7; | |
} | |
; | |
auto tmp10 = | |
[&] | |
{ | |
if (tmp5.all_zero()) | |
{ | |
return at::vec::Vectorized<float>(static_cast<float>(0.0)); | |
} | |
else | |
{ | |
auto tmp8 = tmp6(); | |
auto tmp9 = at::vec::Vectorized<float>(static_cast<float>(0.0)); | |
return decltype(tmp8)::blendv(tmp9, tmp8, tmp5.template cast<float,1>()); | |
} | |
} | |
() | |
; | |
tmp10.store(out_ptr1 + static_cast<long>(x0)); | |
} | |
} | |
{ | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(832L); x0+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = x0; | |
auto tmp1 = c10::convert<int32_t>(tmp0); | |
auto tmp2 = at::vec::Vectorized<int32_t>::arange(tmp1, 1); | |
auto tmp3 = static_cast<int32_t>(819); | |
auto tmp4 = at::vec::Vectorized<int32_t>(tmp3); | |
auto tmp5 = at::vec::VecMask<int32_t,1>(tmp2 < tmp4); | |
auto tmp6 = [&] | |
{ | |
auto tmp7 = tmp5.template cast<float,1>().template loadu<int64_t,2>(in_ptr2 + static_cast<long>(x0)); | |
return tmp7; | |
} | |
; | |
auto tmp10 = | |
[&] | |
{ | |
if (tmp5.all_zero()) | |
{ | |
return at::vec::VectorizedN<int64_t,2>(static_cast<int64_t>(0)); | |
} | |
else | |
{ | |
auto tmp8 = tmp6(); | |
auto tmp9 = at::vec::VectorizedN<int64_t,2>(static_cast<int64_t>(0)); | |
return decltype(tmp8)::blendv(tmp9, tmp8, tmp5.template cast<int64_t,2>()); | |
} | |
} | |
() | |
; | |
tmp10.store(out_ptr2 + static_cast<long>(x0), 16); | |
} | |
} | |
} | |
''') | |
async_compile.wait(globals()) | |
del async_compile | |
def call(args): | |
arg0_1, arg1_1, arg2_1 = args | |
args.clear() | |
assert_size_stride(arg0_1, (1, 819), (819, 1)) | |
assert_size_stride(arg1_1, (1, 819), (819, 1)) | |
assert_size_stride(arg2_1, (1, 819), (4096, 1)) | |
buf0 = empty_strided_cpu((1, 832), (832, 1), torch.int64) | |
buf1 = empty_strided_cpu((1, 832), (832, 1), torch.float32) | |
buf2 = empty_strided_cpu((1, 832), (832, 1), torch.int64) | |
cpp_fused_constant_pad_nd_0(arg0_1, arg1_1, arg2_1, buf0, buf1, buf2) | |
del arg0_1 | |
del arg1_1 | |
del arg2_1 | |
return (buf0, buf1, buf2, ) | |
def benchmark_compiled_module(times=10, repeat=10): | |
from torch._dynamo.testing import rand_strided | |
from torch._inductor.utils import print_performance | |
arg0_1 = rand_strided((1, 819), (819, 1), device='cpu', dtype=torch.int64) | |
arg1_1 = rand_strided((1, 819), (819, 1), device='cpu', dtype=torch.float32) | |
arg2_1 = rand_strided((1, 819), (4096, 1), device='cpu', dtype=torch.int64) | |
fn = lambda: call([arg0_1, arg1_1, arg2_1]) | |
return print_performance(fn, times=times, repeat=repeat) | |
if __name__ == "__main__": | |
from torch._inductor.wrapper_benchmark import compiled_module_main | |
compiled_module_main('hf_BigBird', benchmark_compiled_module) | |
V0627 17:31:02.910000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"} | |
[ | |
] | |
V0627 17:31:02.911000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "ae71e2a61c1f7e9b1434b71d14d096e3"} | |
TREE_GUARD_MANAGER: | |
+- RootGuardManager | |
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards | |
| +- GLOBAL_STATE: ___check_global_state() | |
| +- GuardManager: source=L['input_ids'], accessed_by=DictGetItemGuardAccessor(input_ids) | |
| | +- TENSOR_MATCH: check_tensor(L['input_ids'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int64, device=None, requires_grad=False, size=[1, 819], stride=[819, 1]) | |
| | +- NO_HASATTR: hasattr(L['input_ids'], '_dynamo_dynamic_indices') == False | |
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['input_ids'], L['attention_mask'], L['token_type_ids']) | |
| +- GuardManager: source=L['padding_len'], accessed_by=DictGetItemGuardAccessor(padding_len) | |
| | +- EQUALS_MATCH: L['padding_len'] == 13 | |
| +- GuardManager: source=L['pad_token_id'], accessed_by=DictGetItemGuardAccessor(pad_token_id) | |
| | +- EQUALS_MATCH: L['pad_token_id'] == 0 | |
| +- GuardManager: source=L['position_ids'], accessed_by=DictGetItemGuardAccessor(position_ids) | |
| | +- ID_MATCH: ___check_obj_id(L['position_ids'], 7636800) | |
| +- GuardManager: source=L['inputs_embeds'], accessed_by=DictGetItemGuardAccessor(inputs_embeds) | |
| | +- ID_MATCH: ___check_obj_id(L['inputs_embeds'], 7636800) | |
| +- GuardManager: source=L['attention_mask'], accessed_by=DictGetItemGuardAccessor(attention_mask) | |
| | +- TENSOR_MATCH: check_tensor(L['attention_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 819], stride=[819, 1]) | |
| | +- NO_HASATTR: hasattr(L['attention_mask'], '_dynamo_dynamic_indices') == False | |
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['input_ids'], L['attention_mask'], L['token_type_ids']) | |
| +- GuardManager: source=L['token_type_ids'], accessed_by=DictGetItemGuardAccessor(token_type_ids) | |
| | +- TENSOR_MATCH: check_tensor(L['token_type_ids'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int64, device=None, requires_grad=False, size=[1, 819], stride=[4096, 1]) | |
| | +- NO_HASATTR: hasattr(L['token_type_ids'], '_dynamo_dynamic_indices') == False | |
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['input_ids'], L['attention_mask'], L['token_type_ids']) | |
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor | |
| | +- GuardManager: source=G['nn'], accessed_by=DictGetItemGuardAccessor(nn) | |
| | | +- ID_MATCH: ___check_obj_id(G['nn'], 139842442593680) | |
| | | +- GuardManager: source=G['nn'].functional, accessed_by=GetAttrGuardAccessor(functional) | |
| | | | +- ID_MATCH: ___check_obj_id(G['nn'].functional, 139842441627024) | |
| | | | +- GuardManager: source=G['nn'].functional.pad, accessed_by=GetAttrGuardAccessor(pad) | |
| | | | | +- GuardManager: source=G['nn'].functional.pad.__code__, accessed_by=GetAttrGuardAccessor(__code__) | |
| | | | | | +- ID_MATCH: ___check_obj_id(G['nn'].functional.pad.__code__, 139842439629440) | |
| | | | | +- GuardManager: source=G['nn'].functional.pad, accessed_by=FuncDefaultsGuardAccessor | |
| | | | | | +- GuardManager: source=G['nn'].functional.pad.__defaults__[0], accessed_by=GetItemGuardAccessor(0) | |
| | | | | | | +- EQUALS_MATCH: G['nn'].functional.pad.__defaults__[0] == 'constant' | |
| | +- GuardManager: source=G['__import_torch_dot_nn_dot_functional'], accessed_by=DictGetItemGuardAccessor(__import_torch_dot_nn_dot_functional) | |
| | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_functional'], 139842441627024) | |
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_functional'].torch, accessed_by=GetAttrGuardAccessor(torch) | |
| | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_functional'].torch, 139845236322800) | |
| | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_functional'].torch._C, accessed_by=GetAttrGuardAccessor(_C) | |
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_functional'].torch._C, 139845228547104) | |
| | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_functional'].torch._C._nn, accessed_by=GetAttrGuardAccessor(_nn) | |
| | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_functional'].torch._C._nn, 139842445377216) | |
| | | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_functional'].torch._C._nn.pad, accessed_by=GetAttrGuardAccessor(pad) | |
| | | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_functional'].torch._C._nn.pad, 139842445416928) | |
| | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_functional'].torch.jit, accessed_by=GetAttrGuardAccessor(jit) | |
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_functional'].torch.jit, 139842414949968) | |
| | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_functional'].torch.jit.is_scripting, accessed_by=GetAttrGuardAccessor(is_scripting) | |
| | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_functional'].torch.jit.is_scripting, 139842422983696) | |
| | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_functional'].torch.are_deterministic_algorithms_enabled, accessed_by=GetAttrGuardAccessor(are_deterministic_algorithms_enabled) | |
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_functional'].torch.are_deterministic_algorithms_enabled, 139842451619504) | |
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_functional'].has_torch_function_unary, accessed_by=GetAttrGuardAccessor(has_torch_function_unary) | |
| | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_functional'].has_torch_function_unary, 139845228559104) | |
V0627 17:31:02.911000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "4/0", "frame_key": "9", "co_name": "torch_dynamo_resume_in__pad_to_block_size_at_2226", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 2226, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 26, "shape_env_guard_count": 0, "graph_op_count": 3, "graph_node_count": 7, "graph_input_count": 3, "start_time": 1719534662.838091, "entire_frame_compile_time_s": 0.07323813438415527, "backend_compile_time_s": 0.05719876289367676, "inductor_compile_time_s": 0.03380870819091797, "code_gen_time_s": 0.027545690536499023, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": [], "dynamo_time_before_restart_s": 0.0, "has_guarded_code": true}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:02.912000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:02.914000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 16, "size": 6656}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:02.914000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31b97f10>", "describer_id": 16}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:02.914000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 16, "id": 0, "source": "L['___stack0'][1]"}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:02.915000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 16, "size": 3328}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:02.915000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 16}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:02.915000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 16, "id": 1, "source": "L['___stack0'][2]"}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:02.915000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 16, "size": 6656}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:02.916000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 2, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2ea81b39c0>", "describer_id": 16}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:02.916000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 16, "id": 2, "source": "L['___stack0'][3]"}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:02.936000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 3, "describer_id": 16, "size": 32768}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:02.936000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 16, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 4096], "is_leaf": true, "stride": [4096, 1], "storage": 3, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2eb1d44270>", "describer_id": 16}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:02.936000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 16, "id": 16, "source": "L['self'].embeddings.position_ids"}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:02.997000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 17, "size": 6656}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:02.998000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31b97f10>", "describer_id": 17}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:02.998000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 17, "id": 0, "source": "L['___stack0'][1]"}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:02.998000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 17, "size": 3328}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:02.998000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 17}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:02.998000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 17, "id": 1, "source": "L['___stack0'][2]"}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:02.999000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 17, "size": 6656}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:02.999000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 2, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2ea81b39c0>", "describer_id": 17}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:02.999000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 17, "id": 2, "source": "L['___stack0'][3]"}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.014000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 3, "describer_id": 17, "size": 32768}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.014000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 4096], "is_leaf": true, "stride": [4096, 1], "storage": 3, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2eb1d44270>", "describer_id": 17}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.014000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 17, "id": 3, "source": "L['self'].embeddings.position_ids"}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.027000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {"l_stack0_1_": [1, 832], "l_stack0_2_": [1, 832], "l_stack0_3_": [1, 832], "blocked_encoder_mask": [1, 13, 64], "getitem": [1, 9, 64], "getitem_1": [1, 9, 64], "getitem_2": [1, 9, 64], "exp_blocked_to_pad": [1, 9, 192], "getitem_3": [1, 9, 64], "band_mask": [1, 1, 9, 64, 192], "unsqueeze_": [1, 1, 9, 64, 192], "from_mask": [1, 1, 832, 1], "to_mask": [1, 1, 1, 832], "l__self___embeddings_position_ids": [1, 4096], "position_ids": [1, 832], "inputs_embeds": [1, 832, 768], "token_type_embeddings": [1, 832, 768], "embeddings": [1, 832, 768], "position_embeddings": [1, 832, 768], "embeddings_1": [1, 832, 768], "embeddings_2": [1, 832, 768], "embeddings_3": [1, 832, 768]}}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1, "has_payload": "5bf8fff16cea4127a0a6b6a6800ef31a"} | |
class GraphModule(torch.nn.Module): | |
def forward(self, L_stack0_1_: "i64[1, 832][832, 1]cpu", L_stack0_2_: "f32[1, 832][832, 1]cpu", L_stack0_3_: "i64[1, 832][832, 1]cpu"): | |
l_stack0_1_ = L_stack0_1_ | |
l_stack0_2_ = L_stack0_2_ | |
l_stack0_3_ = L_stack0_3_ | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2200 in create_masks_for_block_sparse_attn, code: blocked_encoder_mask = attention_mask.view(batch_size, seq_length // block_size, block_size) | |
blocked_encoder_mask: "f32[1, 13, 64][832, 64, 1]cpu" = l_stack0_2_.view(1, 13, 64) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2194 in create_band_mask_from_inputs, code: [to_blocked_mask[:, 1:-3], to_blocked_mask[:, 2:-2], to_blocked_mask[:, 3:-1]], dim=2 | |
getitem: "f32[1, 9, 64][832, 64, 1]cpu" = blocked_encoder_mask[(slice(None, None, None), slice(1, -3, None))] | |
getitem_1: "f32[1, 9, 64][832, 64, 1]cpu" = blocked_encoder_mask[(slice(None, None, None), slice(2, -2, None))] | |
getitem_2: "f32[1, 9, 64][832, 64, 1]cpu" = blocked_encoder_mask[(slice(None, None, None), slice(3, -1, None))] | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2193 in create_band_mask_from_inputs, code: exp_blocked_to_pad = torch.cat( | |
exp_blocked_to_pad: "f32[1, 9, 192][1728, 192, 1]cpu" = torch.cat([getitem, getitem_1, getitem_2], dim = 2); getitem = getitem_1 = getitem_2 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2196 in create_band_mask_from_inputs, code: band_mask = torch.einsum("blq,blk->blqk", from_blocked_mask[:, 2:-2], exp_blocked_to_pad) | |
getitem_3: "f32[1, 9, 64][832, 64, 1]cpu" = blocked_encoder_mask[(slice(None, None, None), slice(2, -2, None))] | |
band_mask: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = torch.functional.einsum('blq,blk->blqk', getitem_3, exp_blocked_to_pad); getitem_3 = exp_blocked_to_pad = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2197 in create_band_mask_from_inputs, code: band_mask.unsqueeze_(1) | |
unsqueeze_: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = band_mask.unsqueeze_(1) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2203 in create_masks_for_block_sparse_attn, code: from_mask = attention_mask.view(batch_size, 1, seq_length, 1) | |
from_mask: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu" = l_stack0_2_.view(1, 1, 832, 1) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2204 in create_masks_for_block_sparse_attn, code: to_mask = attention_mask.view(batch_size, 1, 1, seq_length) | |
to_mask: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = l_stack0_2_.view(1, 1, 1, 832); l_stack0_2_ = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:282 in forward, code: position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] | |
l__self___embeddings_position_ids: "i64[1, 4096][4096, 1]cpu" = self.L__self___embeddings_position_ids | |
position_ids: "i64[1, 832][4096, 1]cpu" = l__self___embeddings_position_ids[(slice(None, None, None), slice(0, 832, None))]; l__self___embeddings_position_ids = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:296 in forward, code: inputs_embeds = self.word_embeddings(input_ids) | |
inputs_embeds: "f32[1, 832, 768][638976, 768, 1]cpu" = self.L__self___embeddings_word_embeddings(l_stack0_1_); l_stack0_1_ = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:301 in forward, code: token_type_embeddings = self.token_type_embeddings(token_type_ids) | |
token_type_embeddings: "f32[1, 832, 768][638976, 768, 1]cpu" = self.L__self___embeddings_token_type_embeddings(l_stack0_3_); l_stack0_3_ = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:303 in forward, code: embeddings = inputs_embeds + token_type_embeddings | |
embeddings: "f32[1, 832, 768][638976, 768, 1]cpu" = inputs_embeds + token_type_embeddings; inputs_embeds = token_type_embeddings = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:305 in forward, code: position_embeddings = self.position_embeddings(position_ids) | |
position_embeddings: "f32[1, 832, 768][638976, 768, 1]cpu" = self.L__self___embeddings_position_embeddings(position_ids); position_ids = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:306 in forward, code: embeddings += position_embeddings | |
embeddings += position_embeddings; embeddings_1: "f32[1, 832, 768][638976, 768, 1]cpu" = embeddings; embeddings = position_embeddings = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:308 in forward, code: embeddings = self.dropout(embeddings) | |
embeddings_2: "f32[1, 832, 768][638976, 768, 1]cpu" = self.L__self___embeddings_dropout(embeddings_1); embeddings_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:309 in forward, code: embeddings = self.LayerNorm(embeddings) | |
embeddings_3: "f32[1, 832, 768][638976, 768, 1]cpu" = self.L__self___embeddings_LayerNorm(embeddings_2); embeddings_2 = None | |
return (embeddings_3, band_mask, from_mask, to_mask, blocked_encoder_mask) | |
V0627 17:31:03.091000 139845268738432 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:199] {"aot_forward_graph": {}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1, "has_payload": "f7b6ff7875cdbc7ff1ea7b5f6bc39ed2"} | |
class <lambda>(torch.nn.Module): | |
def forward(self, arg0_1: "f32[50358, 768][768, 1]cpu", arg1_1: "f32[2, 768][768, 1]cpu", arg2_1: "f32[4096, 768][768, 1]cpu", arg3_1: "f32[768][1]cpu", arg4_1: "f32[768][1]cpu", arg5_1: "i64[1, 4096][4096, 1]cpu", arg6_1: "i64[1, 832][832, 1]cpu", arg7_1: "f32[1, 832][832, 1]cpu", arg8_1: "i64[1, 832][832, 1]cpu"): | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2200 in create_masks_for_block_sparse_attn, code: blocked_encoder_mask = attention_mask.view(batch_size, seq_length // block_size, block_size) | |
view: "f32[1, 13, 64][832, 64, 1]cpu" = torch.ops.aten.view.default(arg7_1, [1, 13, 64]) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2194 in create_band_mask_from_inputs, code: [to_blocked_mask[:, 1:-3], to_blocked_mask[:, 2:-2], to_blocked_mask[:, 3:-1]], dim=2 | |
slice_1: "f32[1, 13, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view, 0, 0, 9223372036854775807) | |
slice_2: "f32[1, 9, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_1, 1, 1, -3); slice_1 = None | |
slice_3: "f32[1, 13, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view, 0, 0, 9223372036854775807) | |
slice_4: "f32[1, 9, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_3, 1, 2, -2); slice_3 = None | |
slice_5: "f32[1, 13, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view, 0, 0, 9223372036854775807) | |
slice_6: "f32[1, 9, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_5, 1, 3, -1); slice_5 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2193 in create_band_mask_from_inputs, code: exp_blocked_to_pad = torch.cat( | |
cat: "f32[1, 9, 192][1728, 192, 1]cpu" = torch.ops.aten.cat.default([slice_2, slice_4, slice_6], 2); slice_2 = slice_4 = slice_6 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2196 in create_band_mask_from_inputs, code: band_mask = torch.einsum("blq,blk->blqk", from_blocked_mask[:, 2:-2], exp_blocked_to_pad) | |
slice_7: "f32[1, 13, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view, 0, 0, 9223372036854775807) | |
slice_8: "f32[1, 9, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_7, 1, 2, -2); slice_7 = None | |
unsqueeze: "f32[1, 9, 64, 1][832, 64, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_8, 3); slice_8 = None | |
permute: "f32[1, 9, 64, 1][832, 64, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze, [0, 1, 2, 3]); unsqueeze = None | |
unsqueeze_1: "f32[1, 9, 192, 1][1728, 192, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(cat, 3); cat = None | |
permute_1: "f32[1, 9, 1, 192][1728, 192, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_1, [0, 1, 3, 2]); unsqueeze_1 = None | |
mul: "f32[1, 9, 64, 192][110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(permute, permute_1); permute = permute_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2197 in create_band_mask_from_inputs, code: band_mask.unsqueeze_(1) | |
unsqueeze_2: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = torch.ops.aten.unsqueeze.default(mul, 1); mul = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2203 in create_masks_for_block_sparse_attn, code: from_mask = attention_mask.view(batch_size, 1, seq_length, 1) | |
view_1: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu" = torch.ops.aten.view.default(arg7_1, [1, 1, 832, 1]) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2204 in create_masks_for_block_sparse_attn, code: to_mask = attention_mask.view(batch_size, 1, 1, seq_length) | |
view_2: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.view.default(arg7_1, [1, 1, 1, 832]); arg7_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:282 in forward, code: position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] | |
slice_9: "i64[1, 4096][4096, 1]cpu" = torch.ops.aten.slice.Tensor(arg5_1, 0, 0, 9223372036854775807); arg5_1 = None | |
slice_10: "i64[1, 832][4096, 1]cpu" = torch.ops.aten.slice.Tensor(slice_9, 1, 0, 832); slice_9 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:296 in forward, code: inputs_embeds = self.word_embeddings(input_ids) | |
embedding: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.embedding.default(arg0_1, arg6_1, 0); arg0_1 = arg6_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:301 in forward, code: token_type_embeddings = self.token_type_embeddings(token_type_ids) | |
embedding_1: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.embedding.default(arg1_1, arg8_1); arg1_1 = arg8_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:303 in forward, code: embeddings = inputs_embeds + token_type_embeddings | |
add: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(embedding, embedding_1); embedding = embedding_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:305 in forward, code: position_embeddings = self.position_embeddings(position_ids) | |
embedding_2: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.embedding.default(arg2_1, slice_10); arg2_1 = slice_10 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:306 in forward, code: embeddings += position_embeddings | |
add_1: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(add, embedding_2); add = embedding_2 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:308 in forward, code: embeddings = self.dropout(embeddings) | |
clone: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.clone.default(add_1); add_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:309 in forward, code: embeddings = self.LayerNorm(embeddings) | |
var_mean = torch.ops.aten.var_mean.correction(clone, [2], correction = 0, keepdim = True) | |
getitem: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[0] | |
getitem_1: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[1]; var_mean = None | |
add_2: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.add.Tensor(getitem, 1e-12); getitem = None | |
rsqrt: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.rsqrt.default(add_2); add_2 = None | |
sub: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.sub.Tensor(clone, getitem_1); clone = getitem_1 = None | |
mul_1: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(sub, rsqrt); sub = rsqrt = None | |
mul_2: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(mul_1, arg3_1); mul_1 = arg3_1 = None | |
add_3: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(mul_2, arg4_1); mul_2 = arg4_1 = None | |
return (add_3, unsqueeze_2, view_1, view_2, view) | |
V0627 17:31:03.161000 139845268738432 torch/_inductor/compile_fx.py:754] {"inductor_post_grad_graph": {}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1, "has_payload": "06e74f82fcfa9d791dc26355727799db"} | |
class <lambda>(torch.nn.Module): | |
def forward(self, arg6_1: "i64[1, 832][832, 1]cpu", arg7_1: "f32[1, 832][832, 1]cpu", arg8_1: "i64[1, 832][832, 1]cpu"): | |
# No stacktrace found for following nodes | |
_frozen_param0: "f32[50358, 768][768, 1]cpu" = self._frozen_param0 | |
_frozen_param1: "f32[2, 768][768, 1]cpu" = self._frozen_param1 | |
_frozen_param3: "f32[768][1]cpu" = self._frozen_param3 | |
_frozen_param4: "f32[768][1]cpu" = self._frozen_param4 | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:305 in forward, code: position_embeddings = self.position_embeddings(position_ids) | |
_frozen_param6: "f32[1, 832, 768][638976, 768, 1]cpu" = self._frozen_param6 | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:296 in forward, code: inputs_embeds = self.word_embeddings(input_ids) | |
embedding: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.embedding.default(_frozen_param0, arg6_1, 0); _frozen_param0 = arg6_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:301 in forward, code: token_type_embeddings = self.token_type_embeddings(token_type_ids) | |
embedding_1: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.embedding.default(_frozen_param1, arg8_1); _frozen_param1 = arg8_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:303 in forward, code: embeddings = inputs_embeds + token_type_embeddings | |
add: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(embedding, embedding_1); embedding = embedding_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:306 in forward, code: embeddings += position_embeddings | |
add_1: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(add, _frozen_param6); add = _frozen_param6 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:309 in forward, code: embeddings = self.LayerNorm(embeddings) | |
var_mean = torch.ops.aten.var_mean.correction(add_1, [2], correction = 0, keepdim = True) | |
getitem: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[0] | |
getitem_1: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[1]; var_mean = None | |
sub: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.sub.Tensor(add_1, getitem_1); add_1 = getitem_1 = None | |
add_2: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.add.Tensor(getitem, 1e-12); getitem = None | |
rsqrt: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.rsqrt.default(add_2); add_2 = None | |
mul_1: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(sub, rsqrt); sub = rsqrt = None | |
mul_2: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(mul_1, _frozen_param3); mul_1 = _frozen_param3 = None | |
add_3: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(mul_2, _frozen_param4); mul_2 = _frozen_param4 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2200 in create_masks_for_block_sparse_attn, code: blocked_encoder_mask = attention_mask.view(batch_size, seq_length // block_size, block_size) | |
view: "f32[1, 13, 64][832, 64, 1]cpu" = torch.ops.aten.reshape.default(arg7_1, [1, 13, 64]) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2194 in create_band_mask_from_inputs, code: [to_blocked_mask[:, 1:-3], to_blocked_mask[:, 2:-2], to_blocked_mask[:, 3:-1]], dim=2 | |
slice_4: "f32[1, 9, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view, 1, 2, -2) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2196 in create_band_mask_from_inputs, code: band_mask = torch.einsum("blq,blk->blqk", from_blocked_mask[:, 2:-2], exp_blocked_to_pad) | |
unsqueeze: "f32[1, 9, 64, 1][832, 64, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_4, 3) | |
permute: "f32[1, 9, 64, 1][832, 64, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze, [0, 1, 2, 3]); unsqueeze = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2194 in create_band_mask_from_inputs, code: [to_blocked_mask[:, 1:-3], to_blocked_mask[:, 2:-2], to_blocked_mask[:, 3:-1]], dim=2 | |
slice_2: "f32[1, 9, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view, 1, 1, -3) | |
slice_6: "f32[1, 9, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view, 1, 3, -1) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2193 in create_band_mask_from_inputs, code: exp_blocked_to_pad = torch.cat( | |
cat: "f32[1, 9, 192][1728, 192, 1]cpu" = torch.ops.aten.cat.default([slice_2, slice_4, slice_6], 2); slice_2 = slice_4 = slice_6 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2196 in create_band_mask_from_inputs, code: band_mask = torch.einsum("blq,blk->blqk", from_blocked_mask[:, 2:-2], exp_blocked_to_pad) | |
unsqueeze_1: "f32[1, 9, 192, 1][1728, 192, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(cat, 3); cat = None | |
permute_1: "f32[1, 9, 1, 192][1728, 192, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_1, [0, 1, 3, 2]); unsqueeze_1 = None | |
mul: "f32[1, 9, 64, 192][110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(permute, permute_1); permute = permute_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2197 in create_band_mask_from_inputs, code: band_mask.unsqueeze_(1) | |
unsqueeze_2: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = torch.ops.aten.unsqueeze.default(mul, 1); mul = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2203 in create_masks_for_block_sparse_attn, code: from_mask = attention_mask.view(batch_size, 1, seq_length, 1) | |
view_1: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu" = torch.ops.aten.reshape.default(arg7_1, [1, 1, 832, 1]) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2204 in create_masks_for_block_sparse_attn, code: to_mask = attention_mask.view(batch_size, 1, 1, seq_length) | |
view_2: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.reshape.default(arg7_1, [1, 1, 1, 832]); arg7_1 = None | |
return (add_3, unsqueeze_2, view_1, view_2, view) | |
V0627 17:31:03.519000 139845268738432 torch/_inductor/graph.py:1693] {"inductor_output_code": {"filename": "/tmp/torchinductor_leslie/de/cdewao76edq6vrvflsagsrjktsdjwfpzvsaaft6tyecuomopfso3.py"}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1, "has_payload": "f30f2b373864eaff49baf96db8ab8cb7"} | |
# AOT ID: ['3_inference'] | |
from ctypes import c_void_p, c_long | |
import torch | |
import math | |
import random | |
import os | |
import tempfile | |
from math import inf, nan | |
from torch._inductor.hooks import run_intermediate_hooks | |
from torch._inductor.utils import maybe_profile | |
from torch._inductor.codegen.memory_planning import _align as align | |
from torch import device, empty_strided | |
from torch._inductor.async_compile import AsyncCompile | |
from torch._inductor.select_algorithm import extern_kernels | |
from torch._inductor.codegen.multi_kernel import MultiKernelCall | |
aten = torch.ops.aten | |
inductor_ops = torch.ops.inductor | |
_quantized = torch.ops._quantized | |
assert_size_stride = torch._C._dynamo.guards.assert_size_stride | |
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu | |
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda | |
alloc_from_pool = torch.ops.inductor._alloc_from_pool | |
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor | |
async_compile = AsyncCompile() | |
_frozen_param0 = None # device(type='cpu') torch.float32 (50358, 768) (768, 1) 7f2eb1d44630 | |
_frozen_param1 = None # device(type='cpu') torch.float32 (2, 768) (768, 1) 7f2eb1d445e0 | |
_frozen_param3 = None # device(type='cpu') torch.float32 (768,) (1,) 7f2eb1d44540 | |
_frozen_param4 = None # device(type='cpu') torch.float32 (768,) (1,) 7f2eb1d44810 | |
_frozen_param6 = None # device(type='cpu') torch.float32 (1, 832, 768) (638976, 768, 1) 7f2e3165ccc0 | |
cpp_fused_add_cat_embedding_mul_native_layer_norm_0 = async_compile.cpp_pybinding(['const int64_t*', 'const float*', 'const int64_t*', 'const float*', 'const float*', 'const float*', 'const float*', 'const float*', 'const float*', 'float*', 'float*', 'float*', 'float*', 'float*', 'float*', 'float*', 'float*'], ''' | |
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h" | |
extern "C" void kernel(const int64_t* in_ptr0, | |
const float* in_ptr1, | |
const int64_t* in_ptr2, | |
const float* in_ptr3, | |
const float* in_ptr4, | |
const float* in_ptr5, | |
const float* in_ptr6, | |
const float* in_ptr7, | |
const float* in_ptr8, | |
float* out_ptr0, | |
float* out_ptr1, | |
float* out_ptr2, | |
float* out_ptr3, | |
float* out_ptr4, | |
float* out_ptr5, | |
float* out_ptr6, | |
float* out_ptr7) | |
{ | |
#pragma omp parallel num_threads(56) | |
{ | |
int tid = omp_get_thread_num(); | |
{ | |
#pragma omp for | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(832L); x0+=static_cast<long>(1L)) | |
{ | |
{ | |
Welford<float> tmp_acc0 = Welford<float>(); | |
Welford<at::vec::Vectorized<float>> tmp_acc0_vec = Welford<at::vec::Vectorized<float>>(); | |
static WeightRecp<at::vec::Vectorized<float>> weight_recps(static_cast<long>(48L)); | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(768L); x1+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = in_ptr0[static_cast<long>(x0)]; | |
auto tmp10 = in_ptr2[static_cast<long>(x0)]; | |
auto tmp21 = at::vec::Vectorized<float>::loadu(in_ptr4 + static_cast<long>(x1 + (768L*x0)), 16); | |
auto tmp1 = 50358L; | |
auto tmp2 = c10::convert<int64_t>(tmp1); | |
auto tmp3 = decltype(tmp0)(tmp0 + tmp2); | |
auto tmp4 = tmp0 < 0; | |
auto tmp5 = tmp4 ? tmp3 : tmp0; | |
auto tmp6 = tmp5; | |
auto tmp7 = c10::convert<int64_t>(tmp6); | |
TORCH_CHECK((0 <= tmp7) & (tmp7 < 50358L), "index out of bounds: 0 <= tmp7 < 50358L"); | |
auto tmp9 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(x1 + (768L*tmp5)), 16); | |
auto tmp11 = 2L; | |
auto tmp12 = c10::convert<int64_t>(tmp11); | |
auto tmp13 = decltype(tmp10)(tmp10 + tmp12); | |
auto tmp14 = tmp10 < 0; | |
auto tmp15 = tmp14 ? tmp13 : tmp10; | |
auto tmp16 = tmp15; | |
auto tmp17 = c10::convert<int64_t>(tmp16); | |
TORCH_CHECK((0 <= tmp17) & (tmp17 < 2L), "index out of bounds: 0 <= tmp17 < 2L"); | |
auto tmp19 = at::vec::Vectorized<float>::loadu(in_ptr3 + static_cast<long>(x1 + (768L*tmp15)), 16); | |
auto tmp20 = tmp9 + tmp19; | |
auto tmp22 = tmp20 + tmp21; | |
tmp22.store(out_ptr0 + static_cast<long>(x1 + (768L*x0))); | |
tmp_acc0_vec = welford_combine(tmp_acc0_vec, tmp22, &weight_recps); | |
} | |
tmp_acc0 = welford_combine(tmp_acc0, welford_vec_reduce_all(tmp_acc0_vec)); | |
out_ptr1[static_cast<long>(x0)] = static_cast<float>(tmp_acc0.mean); | |
out_ptr2[static_cast<long>(x0)] = static_cast<float>(tmp_acc0.m2); | |
} | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(768L); x1+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr0 + static_cast<long>(x1 + (768L*x0)), 16); | |
auto tmp1 = out_ptr1[static_cast<long>(x0)]; | |
auto tmp4 = out_ptr2[static_cast<long>(x0)]; | |
auto tmp12 = at::vec::Vectorized<float>::loadu(in_ptr5 + static_cast<long>(x1), 16); | |
auto tmp14 = at::vec::Vectorized<float>::loadu(in_ptr6 + static_cast<long>(x1), 16); | |
auto tmp2 = at::vec::Vectorized<float>(tmp1); | |
auto tmp3 = tmp0 - tmp2; | |
auto tmp5 = static_cast<float>(768.0); | |
auto tmp6 = tmp4 / tmp5; | |
auto tmp7 = static_cast<float>(1e-12); | |
auto tmp8 = decltype(tmp6)(tmp6 + tmp7); | |
auto tmp9 = 1 / std::sqrt(tmp8); | |
auto tmp10 = at::vec::Vectorized<float>(tmp9); | |
auto tmp11 = tmp3 * tmp10; | |
auto tmp13 = tmp11 * tmp12; | |
auto tmp15 = tmp13 + tmp14; | |
tmp15.store(out_ptr3 + static_cast<long>(x1 + (768L*x0))); | |
} | |
} | |
} | |
#pragma omp single | |
{ | |
{ | |
#pragma GCC ivdep | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(9L); x0+=static_cast<long>(1L)) | |
{ | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr7 + static_cast<long>(64L + x1 + (64L*x0)), 16); | |
tmp0.store(out_ptr4 + static_cast<long>(x1 + (192L*x0))); | |
} | |
} | |
} | |
} | |
#pragma omp single | |
{ | |
{ | |
#pragma GCC ivdep | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(9L); x0+=static_cast<long>(1L)) | |
{ | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr7 + static_cast<long>(128L + x1 + (64L*x0)), 16); | |
tmp0.store(out_ptr5 + static_cast<long>(x1 + (192L*x0))); | |
} | |
} | |
} | |
} | |
#pragma omp single | |
{ | |
{ | |
#pragma GCC ivdep | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(9L); x0+=static_cast<long>(1L)) | |
{ | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr7 + static_cast<long>(192L + x1 + (64L*x0)), 16); | |
tmp0.store(out_ptr6 + static_cast<long>(x1 + (192L*x0))); | |
} | |
} | |
} | |
} | |
#pragma omp single | |
{ | |
{ | |
#pragma GCC ivdep | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(9L); x0+=static_cast<long>(1L)) | |
{ | |
#pragma GCC ivdep | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L)) | |
{ | |
for(long x2=static_cast<long>(0L); x2<static_cast<long>(192L); x2+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = in_ptr7[static_cast<long>(128L + x1 + (64L*x0))]; | |
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr8 + static_cast<long>(x2 + (192L*x0)), 16); | |
auto tmp2 = at::vec::Vectorized<float>(tmp0); | |
auto tmp3 = tmp2 * tmp1; | |
tmp3.store(out_ptr7 + static_cast<long>(x2 + (192L*x1) + (12288L*x0))); | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
''') | |
async_compile.wait(globals()) | |
del async_compile | |
def call(args): | |
arg6_1, arg7_1, arg8_1 = args | |
args.clear() | |
assert_size_stride(arg6_1, (1, 832), (832, 1)) | |
assert_size_stride(arg7_1, (1, 832), (832, 1)) | |
assert_size_stride(arg8_1, (1, 832), (832, 1)) | |
buf0 = empty_strided_cpu((1, 832, 768), (638976, 768, 1), torch.float32) | |
buf1 = empty_strided_cpu((1, 832, 1), (832, 1, 832), torch.float32) | |
buf2 = empty_strided_cpu((1, 832, 1), (832, 1, 832), torch.float32) | |
buf4 = empty_strided_cpu((1, 832, 768), (638976, 768, 1), torch.float32) | |
buf8 = empty_strided_cpu((1, 9, 192), (1728, 192, 1), torch.float32) | |
buf5 = reinterpret_tensor(buf8, (1, 9, 64), (1728, 192, 1), 0) # alias | |
buf6 = reinterpret_tensor(buf8, (1, 9, 64), (1728, 192, 1), 64) # alias | |
buf7 = reinterpret_tensor(buf8, (1, 9, 64), (1728, 192, 1), 128) # alias | |
buf9 = empty_strided_cpu((1, 9, 64, 192), (110592, 12288, 192, 1), torch.float32) | |
cpp_fused_add_cat_embedding_mul_native_layer_norm_0(arg6_1, _frozen_param0, arg8_1, _frozen_param1, _frozen_param6, _frozen_param3, _frozen_param4, arg7_1, buf8, buf0, buf1, buf2, buf4, buf5, buf6, buf7, buf9) | |
del arg6_1 | |
del arg8_1 | |
return (buf4, reinterpret_tensor(buf9, (1, 1, 9, 64, 192), (110592, 110592, 12288, 192, 1), 0), reinterpret_tensor(arg7_1, (1, 1, 832, 1), (832, 832, 1, 1), 0), reinterpret_tensor(arg7_1, (1, 1, 1, 832), (832, 832, 832, 1), 0), reinterpret_tensor(arg7_1, (1, 13, 64), (832, 64, 1), 0), ) | |
def benchmark_compiled_module(times=10, repeat=10): | |
from torch._dynamo.testing import rand_strided | |
from torch._inductor.utils import print_performance | |
global _frozen_param0 | |
_frozen_param0 = rand_strided((50358, 768), (768, 1), device='cpu', dtype=torch.float32) | |
global _frozen_param1 | |
_frozen_param1 = rand_strided((2, 768), (768, 1), device='cpu', dtype=torch.float32) | |
global _frozen_param3 | |
_frozen_param3 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.float32) | |
global _frozen_param4 | |
_frozen_param4 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.float32) | |
global _frozen_param6 | |
_frozen_param6 = rand_strided((1, 832, 768), (638976, 768, 1), device='cpu', dtype=torch.float32) | |
arg6_1 = rand_strided((1, 832), (832, 1), device='cpu', dtype=torch.int64) | |
arg7_1 = rand_strided((1, 832), (832, 1), device='cpu', dtype=torch.float32) | |
arg8_1 = rand_strided((1, 832), (832, 1), device='cpu', dtype=torch.int64) | |
fn = lambda: call([arg6_1, arg7_1, arg8_1]) | |
return print_performance(fn, times=times, repeat=repeat) | |
if __name__ == "__main__": | |
from torch._inductor.wrapper_benchmark import compiled_module_main | |
compiled_module_main('hf_BigBird', benchmark_compiled_module) | |
V0627 17:31:03.542000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"} | |
[ | |
] | |
V0627 17:31:03.542000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1, "has_payload": "1dfadefa57d2d698b82df0a252ee757b"} | |
TREE_GUARD_MANAGER: | |
+- RootGuardManager | |
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards | |
| +- GLOBAL_STATE: ___check_global_state() | |
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self) | |
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839714901584) | |
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824) | |
| | | +- GuardManager: source=L['self'].config, accessed_by=DictGetItemGuardAccessor(config) | |
| | | | +- TYPE_MATCH: ___check_type_id(L['self'].config, 139839810624528) | |
| | | +- GuardManager: source=L['self']._modules, accessed_by=DictGetItemGuardAccessor(_modules) | |
| | | | +- GuardManager: source=L['self'].encoder, accessed_by=DictGetItemGuardAccessor(encoder) | |
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].encoder, 139839713378016) | |
| | | | | +- GuardManager: source=L['self'].encoder.__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | | | | +- GuardManager: source=L['self'].encoder.training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].encoder.training, 7685824) | |
| | | | +- GuardManager: source=L['self'].embeddings, accessed_by=DictGetItemGuardAccessor(embeddings) | |
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings, 139839713378208) | |
| | | | | +- GuardManager: source=L['self'].embeddings.__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | | | | +- DICT_CONTAINS: not ___dict_contains('forward', L['self'].embeddings.__dict__) | |
| | | | | | +- GuardManager: source=L['self'].embeddings.training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.training, 7685824) | |
| | | | | | +- GuardManager: source=L['self'].embeddings._modules, accessed_by=DictGetItemGuardAccessor(_modules) | |
| | | | | | | +- GuardManager: source=L['self'].embeddings.dropout, accessed_by=DictGetItemGuardAccessor(dropout) | |
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.dropout, 139839202278704) | |
| | | | | | | | +- GuardManager: source=L['self'].embeddings.dropout.__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | | | | | | | +- GuardManager: source=L['self'].embeddings.dropout.training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.dropout.training, 7685824) | |
| | | | | | | +- GuardManager: source=L['self'].embeddings.LayerNorm, accessed_by=DictGetItemGuardAccessor(LayerNorm) | |
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.LayerNorm, 139839202278800) | |
| | | | | | | | +- GuardManager: source=L['self'].embeddings.LayerNorm.__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | | | | | | | +- GuardManager: source=L['self'].embeddings.LayerNorm.training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.LayerNorm.training, 7685824) | |
| | | | | | | +- GuardManager: source=L['self'].embeddings.word_embeddings, accessed_by=DictGetItemGuardAccessor(word_embeddings) | |
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.word_embeddings, 139839202271840) | |
| | | | | | | | +- GuardManager: source=L['self'].embeddings.word_embeddings.__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | | | | | | | +- GuardManager: source=L['self'].embeddings.word_embeddings.training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.word_embeddings.training, 7685824) | |
| | | | | | | +- GuardManager: source=L['self'].embeddings.position_embeddings, accessed_by=DictGetItemGuardAccessor(position_embeddings) | |
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.position_embeddings, 139839202279184) | |
| | | | | | | | +- GuardManager: source=L['self'].embeddings.position_embeddings.__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | | | | | | | +- GuardManager: source=L['self'].embeddings.position_embeddings.training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.position_embeddings.training, 7685824) | |
| | | | | | | +- GuardManager: source=L['self'].embeddings.token_type_embeddings, accessed_by=DictGetItemGuardAccessor(token_type_embeddings) | |
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.token_type_embeddings, 139839202279328) | |
| | | | | | | | +- GuardManager: source=L['self'].embeddings.token_type_embeddings.__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | | | | | | | +- GuardManager: source=L['self'].embeddings.token_type_embeddings.training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.token_type_embeddings.training, 7685824) | |
| | | | | | +- GuardManager: source=L['self'].embeddings._buffers, accessed_by=DictGetItemGuardAccessor(_buffers) | |
| | | | | | | +- GuardManager: source=L['self'].embeddings.position_ids, accessed_by=DictGetItemGuardAccessor(position_ids) | |
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.position_ids, 139838528701040) | |
| | | | | | +- GuardManager: source=L['self'].embeddings._forward_hooks, accessed_by=DictGetItemGuardAccessor(_forward_hooks) | |
| | | | | | +- GuardManager: source=L['self'].embeddings._backward_hooks, accessed_by=DictGetItemGuardAccessor(_backward_hooks) | |
| | | | | | +- GuardManager: source=L['self'].embeddings._forward_pre_hooks, accessed_by=DictGetItemGuardAccessor(_forward_pre_hooks) | |
| | | | | | +- GuardManager: source=L['self'].embeddings.rescale_embeddings, accessed_by=DictGetItemGuardAccessor(rescale_embeddings) | |
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.rescale_embeddings, 7685824) | |
| | | | | | +- GuardManager: source=L['self'].embeddings._backward_pre_hooks, accessed_by=DictGetItemGuardAccessor(_backward_pre_hooks) | |
| | | +- GuardManager: source=L['self'].block_size, accessed_by=DictGetItemGuardAccessor(block_size) | |
| | | | +- EQUALS_MATCH: L['self'].block_size == 64 | |
| | | +- GuardManager: source=L['self'].attention_type, accessed_by=DictGetItemGuardAccessor(attention_type) | |
| | | | +- EQUALS_MATCH: L['self'].attention_type == 'block_sparse' | |
| +- GuardManager: source=L['___stack0'], accessed_by=DictGetItemGuardAccessor(___stack0) | |
| | +- TYPE_MATCH: ___check_type_id(L['___stack0'], 7625984) | |
| | +- LENGTH_CHECK: len(L['___stack0']) == 6 | |
| | +- GuardManager: source=L['___stack0'][0], accessed_by=TupleGetItemGuardAccessor(0) | |
| | | +- EQUALS_MATCH: L['___stack0'][0] == 13 | |
| | +- GuardManager: source=L['___stack0'][1], accessed_by=TupleGetItemGuardAccessor(1) | |
| | | +- TENSOR_MATCH: check_tensor(L['___stack0'][1], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int64, device=None, requires_grad=False, size=[1, 832], stride=[832, 1]) | |
| | | +- NO_HASATTR: hasattr(L['___stack0'][1], '_dynamo_dynamic_indices') == False | |
| | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['___stack0'][1], L['___stack0'][2], L['___stack0'][3]) | |
| | +- GuardManager: source=L['___stack0'][2], accessed_by=TupleGetItemGuardAccessor(2) | |
| | | +- TENSOR_MATCH: check_tensor(L['___stack0'][2], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 832], stride=[832, 1]) | |
| | | +- NO_HASATTR: hasattr(L['___stack0'][2], '_dynamo_dynamic_indices') == False | |
| | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['___stack0'][1], L['___stack0'][2], L['___stack0'][3]) | |
| | +- GuardManager: source=L['___stack0'][3], accessed_by=TupleGetItemGuardAccessor(3) | |
| | | +- TENSOR_MATCH: check_tensor(L['___stack0'][3], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int64, device=None, requires_grad=False, size=[1, 832], stride=[832, 1]) | |
| | | +- NO_HASATTR: hasattr(L['___stack0'][3], '_dynamo_dynamic_indices') == False | |
| | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['___stack0'][1], L['___stack0'][2], L['___stack0'][3]) | |
| | +- GuardManager: source=L['___stack0'][4], accessed_by=TupleGetItemGuardAccessor(4) | |
| | | +- ID_MATCH: ___check_obj_id(L['___stack0'][4], 7636800) | |
| | +- GuardManager: source=L['___stack0'][5], accessed_by=TupleGetItemGuardAccessor(5) | |
| | | +- ID_MATCH: ___check_obj_id(L['___stack0'][5], 7636800) | |
| +- GuardManager: source=L['head_mask'], accessed_by=DictGetItemGuardAccessor(head_mask) | |
| | +- ID_MATCH: ___check_obj_id(L['head_mask'], 7636800) | |
| +- GuardManager: source=L['use_cache'], accessed_by=DictGetItemGuardAccessor(use_cache) | |
| | +- ID_MATCH: ___check_obj_id(L['use_cache'], 7685824) | |
| +- GuardManager: source=L['return_dict'], accessed_by=DictGetItemGuardAccessor(return_dict) | |
| | +- ID_MATCH: ___check_obj_id(L['return_dict'], 7685856) | |
| +- GuardManager: source=L['past_key_values'], accessed_by=DictGetItemGuardAccessor(past_key_values) | |
| | +- ID_MATCH: ___check_obj_id(L['past_key_values'], 7636800) | |
| +- GuardManager: source=L['output_attentions'], accessed_by=DictGetItemGuardAccessor(output_attentions) | |
| | +- ID_MATCH: ___check_obj_id(L['output_attentions'], 7685824) | |
| +- GuardManager: source=L['output_hidden_states'], accessed_by=DictGetItemGuardAccessor(output_hidden_states) | |
| | +- ID_MATCH: ___check_obj_id(L['output_hidden_states'], 7685824) | |
| +- GuardManager: source=L['encoder_hidden_states'], accessed_by=DictGetItemGuardAccessor(encoder_hidden_states) | |
| | +- ID_MATCH: ___check_obj_id(L['encoder_hidden_states'], 7636800) | |
| +- GuardManager: source=L['past_key_values_length'], accessed_by=DictGetItemGuardAccessor(past_key_values_length) | |
| | +- EQUALS_MATCH: L['past_key_values_length'] == 0 | |
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor | |
| | +- GuardManager: source=G['torch'], accessed_by=DictGetItemGuardAccessor(torch) | |
| | | +- ID_MATCH: ___check_obj_id(G['torch'], 139845236322800) | |
| | | +- GuardManager: source=G['torch'].cat, accessed_by=GetAttrGuardAccessor(cat) | |
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].cat, 139845228834672) | |
| | | +- GuardManager: source=G['torch'].einsum, accessed_by=GetAttrGuardAccessor(einsum) | |
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].einsum, 139842415911568) | |
| | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'], accessed_by=DictGetItemGuardAccessor(__import_torch_dot_nn_dot_modules_dot_module) | |
| | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'], 139842442598640) | |
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'].torch, accessed_by=GetAttrGuardAccessor(torch) | |
| | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'].torch, 139845236322800) | |
| | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C, accessed_by=GetAttrGuardAccessor(_C) | |
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C, 139845228547104) | |
| | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C._get_tracing_state, accessed_by=GetAttrGuardAccessor(_get_tracing_state) | |
| | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C._get_tracing_state, 139842451895088) | |
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_hooks, accessed_by=GetAttrGuardAccessor(_global_forward_hooks) | |
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_hooks, 7497696) | |
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_hooks | |
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_hooks, accessed_by=GetAttrGuardAccessor(_global_backward_hooks) | |
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_hooks, 7497696) | |
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_hooks | |
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_pre_hooks, accessed_by=GetAttrGuardAccessor(_global_forward_pre_hooks) | |
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_pre_hooks, 7497696) | |
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_pre_hooks | |
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_pre_hooks, accessed_by=GetAttrGuardAccessor(_global_backward_pre_hooks) | |
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_pre_hooks, 7497696) | |
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_pre_hooks | |
V0627 17:31:03.543000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "5/0", "frame_key": "10", "co_name": "torch_dynamo_resume_in_forward_at_2077", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 2077, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 57, "shape_env_guard_count": 0, "graph_op_count": 18, "graph_node_count": 23, "graph_input_count": 3, "start_time": 1719534662.9121282, "entire_frame_compile_time_s": 0.6307895183563232, "backend_compile_time_s": 0.49609994888305664, "inductor_compile_time_s": 0.37875938415527344, "code_gen_time_s": 0.3245351314544678, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Graph break due to unsupported builtin numpy.random.mtrand.seed. This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind). If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround. If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/docs/main/notes/custom_operators.html for more details) or, if it is traceable, use torch.compiler.allow_in_graph."], "dynamo_time_before_restart_s": 0.08347654342651367, "has_guarded_code": true}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.544000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:03.565000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 19, "size": 442368}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:03.565000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 19}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:03.565000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 19, "id": 0, "source": "L['band_mask']"}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:03.566000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 19, "size": 2555904}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:03.566000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6e30>", "describer_id": 19}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:03.566000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 19, "id": 1, "source": "L['hidden_states']"}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:03.567000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 19, "size": 3328}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:03.568000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 19}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:03.568000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 2, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 19}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:03.568000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 19, "id": 2, "source": "L['from_mask']"}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:03.570000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 5, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 19}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:03.570000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 19, "id": 5, "source": "L['to_mask']"}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:03.600000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "6/0", "frame_key": "11", "co_name": "forward", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 1578, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": null, "shape_env_guard_count": null, "graph_op_count": null, "graph_node_count": null, "graph_input_count": null, "start_time": 1719534663.5449224, "entire_frame_compile_time_s": null, "backend_compile_time_s": null, "inductor_compile_time_s": null, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": [], "dynamo_time_before_restart_s": 0.05504441261291504, "has_guarded_code": false}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:03.600000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:03.607000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 20, "size": 442368}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:03.608000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 20}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:03.608000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 20, "id": 0, "source": "L['band_mask']"}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:03.609000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 20, "size": 2555904}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:03.609000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6e30>", "describer_id": 20}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:03.609000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 20, "id": 1, "source": "L['hidden_states']"}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:03.610000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 20, "size": 3328}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:03.610000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 20}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:03.610000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 2, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 20}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:03.611000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 20, "id": 2, "source": "L['from_mask']"}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:03.612000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 4, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 20}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:03.612000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 20, "id": 4, "source": "L['to_mask']"}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:03.643000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 21, "size": 2555904}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.643000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6e30>", "describer_id": 21}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.643000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 21, "id": 0, "source": "L['hidden_states']"}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.644000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 21, "size": 442368}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.644000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 21}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.644000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 21, "id": 1, "source": "L['band_mask']"}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.645000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 21, "size": 3328}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.645000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 21}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.645000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 2, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 21}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.645000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 21, "id": 2, "source": "L['from_mask']"}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.646000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 4, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 21}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.647000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 21, "id": 4, "source": "L['to_mask']"}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.648000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 5, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 13, 64], "is_leaf": true, "is_view": true, "stride": [832, 64, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317fbec0>", "describer_id": 21}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.648000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 21, "id": 5, "source": "L['blocked_encoder_mask']"}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.656000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"} | |
[ | |
] | |
V0627 17:31:03.656000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1, "has_payload": "76aa2c3aac969b0b973556e5e5d20d8b"} | |
TREE_GUARD_MANAGER: | |
+- RootGuardManager | |
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards | |
| +- GLOBAL_STATE: ___check_global_state() | |
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self) | |
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202276400) | |
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824) | |
| | | +- GuardManager: source=L['self']._modules, accessed_by=DictGetItemGuardAccessor(_modules) | |
| | | | +- GuardManager: source=L['self'].attention, accessed_by=DictGetItemGuardAccessor(attention) | |
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].attention, 139839202275632) | |
| | | | | +- GuardManager: source=L['self'].attention.__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | | | | +- GuardManager: source=L['self'].attention.training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].attention.training, 7685824) | |
| +- GuardManager: source=L['to_mask'], accessed_by=DictGetItemGuardAccessor(to_mask) | |
| | +- TENSOR_MATCH: check_tensor(L['to_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 1, 832], stride=[832, 832, 832, 1]) | |
| | +- NO_HASATTR: hasattr(L['to_mask'], '_dynamo_dynamic_indices') == False | |
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['blocked_encoder_mask']) | |
| +- GuardManager: source=L['band_mask'], accessed_by=DictGetItemGuardAccessor(band_mask) | |
| | +- TENSOR_MATCH: check_tensor(L['band_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 9, 64, 192], stride=[110592, 110592, 12288, 192, 1]) | |
| | +- NO_HASATTR: hasattr(L['band_mask'], '_dynamo_dynamic_indices') == False | |
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['blocked_encoder_mask']) | |
| +- GuardManager: source=L['from_mask'], accessed_by=DictGetItemGuardAccessor(from_mask) | |
| | +- TENSOR_MATCH: check_tensor(L['from_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 832, 1], stride=[832, 832, 1, 1]) | |
| | +- NO_HASATTR: hasattr(L['from_mask'], '_dynamo_dynamic_indices') == False | |
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['blocked_encoder_mask']) | |
| +- GuardManager: source=L['head_mask'], accessed_by=DictGetItemGuardAccessor(head_mask) | |
| | +- ID_MATCH: ___check_obj_id(L['head_mask'], 7636800) | |
| +- GuardManager: source=L['hidden_states'], accessed_by=DictGetItemGuardAccessor(hidden_states) | |
| | +- TENSOR_MATCH: check_tensor(L['hidden_states'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 832, 768], stride=[638976, 768, 1]) | |
| | +- NO_HASATTR: hasattr(L['hidden_states'], '_dynamo_dynamic_indices') == False | |
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['blocked_encoder_mask']) | |
| +- GuardManager: source=L['attention_mask'], accessed_by=DictGetItemGuardAccessor(attention_mask) | |
| | +- ID_MATCH: ___check_obj_id(L['attention_mask'], 7636800) | |
| +- GuardManager: source=L['past_key_value'], accessed_by=DictGetItemGuardAccessor(past_key_value) | |
| | +- ID_MATCH: ___check_obj_id(L['past_key_value'], 7636800) | |
| +- GuardManager: source=L['output_attentions'], accessed_by=DictGetItemGuardAccessor(output_attentions) | |
| | +- ID_MATCH: ___check_obj_id(L['output_attentions'], 7685824) | |
| +- GuardManager: source=L['blocked_encoder_mask'], accessed_by=DictGetItemGuardAccessor(blocked_encoder_mask) | |
| | +- TENSOR_MATCH: check_tensor(L['blocked_encoder_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 13, 64], stride=[832, 64, 1]) | |
| | +- NO_HASATTR: hasattr(L['blocked_encoder_mask'], '_dynamo_dynamic_indices') == False | |
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['blocked_encoder_mask']) | |
| +- GuardManager: source=L['encoder_hidden_states'], accessed_by=DictGetItemGuardAccessor(encoder_hidden_states) | |
| | +- ID_MATCH: ___check_obj_id(L['encoder_hidden_states'], 7636800) | |
| +- GuardManager: source=L['encoder_attention_mask'], accessed_by=DictGetItemGuardAccessor(encoder_attention_mask) | |
| | +- ID_MATCH: ___check_obj_id(L['encoder_attention_mask'], 7636800) | |
V0627 17:31:03.657000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "7/0", "frame_key": "12", "co_name": "forward", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 1472, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 18, "shape_env_guard_count": 0, "graph_op_count": 0, "graph_node_count": 5, "graph_input_count": 5, "start_time": 1719534663.6010149, "entire_frame_compile_time_s": 0.05594229698181152, "backend_compile_time_s": null, "inductor_compile_time_s": null, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Graph break due to unsupported builtin numpy.random.mtrand.seed. This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind). If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround. If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/docs/main/notes/custom_operators.html for more details) or, if it is traceable, use torch.compiler.allow_in_graph."], "dynamo_time_before_restart_s": 0.039438724517822266, "has_guarded_code": true}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.657000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:03.659000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 22, "size": 442368}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:03.659000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 22}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:03.659000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 22, "id": 0, "source": "L['band_mask']"}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:03.660000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 22, "size": 2555904}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:03.660000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6e30>", "describer_id": 22}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:03.660000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 22, "id": 1, "source": "L['hidden_states']"}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:03.661000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 22, "size": 3328}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:03.662000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 22}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:03.662000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 2, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 22}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:03.662000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 22, "id": 2, "source": "L['from_mask']"}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:03.664000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 4, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 22}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:03.664000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 22, "id": 4, "source": "L['to_mask']"}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:03.694000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 23, "size": 442368}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.694000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 23}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.694000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 23, "id": 0, "source": "L['band_mask']"}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.695000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 23, "size": 2555904}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.695000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6e30>", "describer_id": 23}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.695000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 23, "id": 1, "source": "L['hidden_states']"}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.696000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 23, "size": 3328}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.696000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 23}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.697000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 2, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 23}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.697000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 23, "id": 2, "source": "L['from_mask']"}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.698000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 4, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 23}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.699000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 23, "id": 4, "source": "L['to_mask']"}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.701000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 5, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 13, 64], "is_leaf": true, "is_view": true, "stride": [832, 64, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317fbec0>", "describer_id": 23}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.701000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 23, "id": 5, "source": "L['from_blocked_mask']"}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.704000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {"l_band_mask_": [1, 1, 9, 64, 192], "l_from_mask_": [1, 1, 832, 1], "l_to_mask_": [1, 1, 1, 832], "band_mask": [1, 1, 9, 64, 192], "from_mask": [1, 1, 832, 1], "to_mask": [1, 1, 1, 832]}}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1, "has_payload": "b5eca5f100188f494b5033015854eb4e"} | |
class GraphModule(torch.nn.Module): | |
def forward(self, L_band_mask_: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu", L_from_mask_: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu", L_to_mask_: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu"): | |
l_band_mask_ = L_band_mask_ | |
l_from_mask_ = L_from_mask_ | |
l_to_mask_ = L_to_mask_ | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1383 in forward, code: band_mask = band_mask.to(hidden_states.dtype) | |
band_mask: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = l_band_mask_.to(torch.float32); l_band_mask_ = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1385 in forward, code: from_mask = from_mask.to(hidden_states.dtype) | |
from_mask: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu" = l_from_mask_.to(torch.float32); l_from_mask_ = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1387 in forward, code: to_mask = to_mask.to(hidden_states.dtype) | |
to_mask: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = l_to_mask_.to(torch.float32); l_to_mask_ = None | |
return (band_mask, from_mask, to_mask) | |
V0627 17:31:03.718000 139845268738432 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:199] {"aot_forward_graph": {}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1, "has_payload": "3ad949bb5c0c76a73cad2e99f5c9ebe2"} | |
class <lambda>(torch.nn.Module): | |
def forward(self, arg0_1: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu", arg1_1: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu", arg2_1: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu"): | |
return (arg0_1, arg1_1, arg2_1) | |
V0627 17:31:03.731000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"} | |
[ | |
] | |
V0627 17:31:03.731000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1, "has_payload": "6f76e9e822f6dc2ebb0dbc0f0100927d"} | |
TREE_GUARD_MANAGER: | |
+- RootGuardManager | |
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards | |
| +- GLOBAL_STATE: ___check_global_state() | |
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self) | |
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202275632) | |
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824) | |
| | | +- GuardManager: source=L['self']._modules, accessed_by=DictGetItemGuardAccessor(_modules) | |
| | | | +- GuardManager: source=L['self'].self, accessed_by=DictGetItemGuardAccessor(self) | |
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].self, 139839202274384) | |
| | | | | +- GuardManager: source=L['self'].self.__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | | | | +- GuardManager: source=L['self'].self.training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].self.training, 7685824) | |
| | | +- GuardManager: source=L['self'].attention_type, accessed_by=DictGetItemGuardAccessor(attention_type) | |
| | | | +- EQUALS_MATCH: L['self'].attention_type == 'block_sparse' | |
| +- GuardManager: source=L['to_mask'], accessed_by=DictGetItemGuardAccessor(to_mask) | |
| | +- TENSOR_MATCH: check_tensor(L['to_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 1, 832], stride=[832, 832, 832, 1]) | |
| | +- NO_HASATTR: hasattr(L['to_mask'], '_dynamo_dynamic_indices') == False | |
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask']) | |
| +- GuardManager: source=L['band_mask'], accessed_by=DictGetItemGuardAccessor(band_mask) | |
| | +- TENSOR_MATCH: check_tensor(L['band_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 9, 64, 192], stride=[110592, 110592, 12288, 192, 1]) | |
| | +- NO_HASATTR: hasattr(L['band_mask'], '_dynamo_dynamic_indices') == False | |
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask']) | |
| +- GuardManager: source=L['from_mask'], accessed_by=DictGetItemGuardAccessor(from_mask) | |
| | +- TENSOR_MATCH: check_tensor(L['from_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 832, 1], stride=[832, 832, 1, 1]) | |
| | +- NO_HASATTR: hasattr(L['from_mask'], '_dynamo_dynamic_indices') == False | |
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask']) | |
| +- GuardManager: source=L['hidden_states'], accessed_by=DictGetItemGuardAccessor(hidden_states) | |
| | +- TENSOR_MATCH: check_tensor(L['hidden_states'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 832, 768], stride=[638976, 768, 1]) | |
| | +- NO_HASATTR: hasattr(L['hidden_states'], '_dynamo_dynamic_indices') == False | |
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask']) | |
| +- GuardManager: source=L['from_blocked_mask'], accessed_by=DictGetItemGuardAccessor(from_blocked_mask) | |
| | +- TENSOR_MATCH: check_tensor(L['from_blocked_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 13, 64], stride=[832, 64, 1]) | |
| | +- NO_HASATTR: hasattr(L['from_blocked_mask'], '_dynamo_dynamic_indices') == False | |
| | +- TENSOR_ALIASING: L['from_blocked_mask'] is L['to_blocked_mask'] | |
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask']) | |
| +- GuardManager: source=L['output_attentions'], accessed_by=DictGetItemGuardAccessor(output_attentions) | |
| | +- ID_MATCH: ___check_obj_id(L['output_attentions'], 7685824) | |
| +- GuardManager: source=L['encoder_hidden_states'], accessed_by=DictGetItemGuardAccessor(encoder_hidden_states) | |
| | +- ID_MATCH: ___check_obj_id(L['encoder_hidden_states'], 7636800) | |
| +- GuardManager: source=L['to_blocked_mask'], accessed_by=DictGetItemGuardAccessor(to_blocked_mask) | |
| | +- TENSOR_ALIASING: L['from_blocked_mask'] is L['to_blocked_mask'] | |
V0627 17:31:03.732000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "8/0", "frame_key": "13", "co_name": "forward", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 1365, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 16, "shape_env_guard_count": 0, "graph_op_count": 3, "graph_node_count": 7, "graph_input_count": 3, "start_time": 1719534663.657894, "entire_frame_compile_time_s": 0.07398724555969238, "backend_compile_time_s": 0.02206587791442871, "inductor_compile_time_s": 0.0003921985626220703, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Graph break due to unsupported builtin numpy.random.mtrand.seed. This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind). If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround. If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/docs/main/notes/custom_operators.html for more details) or, if it is traceable, use torch.compiler.allow_in_graph."], "dynamo_time_before_restart_s": 0.03484821319580078, "has_guarded_code": true}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.732000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 9, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:03.733000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 25, "size": 2555904}, "frame_id": 9, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:03.733000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6e30>", "describer_id": 25}, "frame_id": 9, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:03.733000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 25, "id": 0, "source": "L['hidden_states']"}, "frame_id": 9, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:03.752000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 26, "size": 2555904}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.752000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6e30>", "describer_id": 26}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.752000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 26, "id": 0, "source": "L['hidden_states']"}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.763000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 7, "describer_id": 26, "size": 442368}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.763000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 7, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 7, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 26}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.763000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 26, "id": 7, "source": "L['band_mask']"}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.764000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 8, "describer_id": 26, "size": 3328}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.764000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 9, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 8, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 26}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.764000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 8, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 8, "base": 9, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 26}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.764000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 26, "id": 8, "source": "L['from_mask']"}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.765000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 10, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 8, "base": 9, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 26}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.765000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 26, "id": 10, "source": "L['to_mask']"}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.766000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 11, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 13, 64], "is_leaf": true, "is_view": true, "stride": [832, 64, 1], "storage": 8, "base": 9, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317fbec0>", "describer_id": 26}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.766000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 26, "id": 11, "source": "L['from_blocked_mask']"}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.768000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {"l_hidden_states_": [1, 832, 768], "l__self___query": [1, 832, 768], "x": [1, 832, 12, 64], "query_layer": [1, 12, 832, 64], "l__self___key": [1, 832, 768], "x_1": [1, 832, 12, 64], "key_layer": [1, 12, 832, 64], "l__self___value": [1, 832, 768], "x_2": [1, 832, 12, 64], "value_layer": [1, 12, 832, 64]}}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1, "has_payload": "0aeb326082c3dce6efb144a844a46bdf"} | |
class GraphModule(torch.nn.Module): | |
def forward(self, L_hidden_states_: "f32[1, 832, 768][638976, 768, 1]cpu"): | |
l_hidden_states_ = L_hidden_states_ | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:468 in forward, code: query_layer = self.transpose_for_scores(self.query(hidden_states)) | |
l__self___query: "bf16[1, 832, 768][638976, 768, 1]cpu" = self.L__self___query(l_hidden_states_) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape) | |
x: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = l__self___query.view(1, 832, 12, 64); l__self___query = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3) | |
query_layer: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = x.permute(0, 2, 1, 3); x = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:469 in forward, code: key_layer = self.transpose_for_scores(self.key(hidden_states)) | |
l__self___key: "bf16[1, 832, 768][638976, 768, 1]cpu" = self.L__self___key(l_hidden_states_) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape) | |
x_1: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = l__self___key.view(1, 832, 12, 64); l__self___key = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3) | |
key_layer: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = x_1.permute(0, 2, 1, 3); x_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:470 in forward, code: value_layer = self.transpose_for_scores(self.value(hidden_states)) | |
l__self___value: "bf16[1, 832, 768][638976, 768, 1]cpu" = self.L__self___value(l_hidden_states_); l_hidden_states_ = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape) | |
x_2: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = l__self___value.view(1, 832, 12, 64); l__self___value = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3) | |
value_layer: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = x_2.permute(0, 2, 1, 3); x_2 = None | |
return (query_layer, key_layer, value_layer) | |
V0627 17:31:03.817000 139845268738432 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:199] {"aot_forward_graph": {}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1, "has_payload": "d91c82abb65a6c80f96ee652ae86f63d"} | |
class <lambda>(torch.nn.Module): | |
def forward(self, arg0_1: "f32[768, 768][768, 1]cpu", arg1_1: "f32[768][1]cpu", arg2_1: "f32[768, 768][768, 1]cpu", arg3_1: "f32[768][1]cpu", arg4_1: "f32[768, 768][768, 1]cpu", arg5_1: "f32[768][1]cpu", arg6_1: "f32[1, 832, 768][638976, 768, 1]cpu"): | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:468 in forward, code: query_layer = self.transpose_for_scores(self.query(hidden_states)) | |
convert_element_type: "bf16[768][1]cpu" = torch.ops.prims.convert_element_type.default(arg1_1, torch.bfloat16); arg1_1 = None | |
convert_element_type_1: "bf16[768, 768][768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg0_1, torch.bfloat16); arg0_1 = None | |
convert_element_type_2: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg6_1, torch.bfloat16) | |
view: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.view.default(convert_element_type_2, [832, 768]); convert_element_type_2 = None | |
permute: "bf16[768, 768][1, 768]cpu" = torch.ops.aten.permute.default(convert_element_type_1, [1, 0]); convert_element_type_1 = None | |
addmm: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.addmm.default(convert_element_type, view, permute); convert_element_type = view = permute = None | |
view_1: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.view.default(addmm, [1, 832, 768]); addmm = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape) | |
view_2: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.view.default(view_1, [1, 832, 12, 64]); view_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3) | |
permute_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_2, [0, 2, 1, 3]); view_2 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:469 in forward, code: key_layer = self.transpose_for_scores(self.key(hidden_states)) | |
convert_element_type_6: "bf16[768][1]cpu" = torch.ops.prims.convert_element_type.default(arg3_1, torch.bfloat16); arg3_1 = None | |
convert_element_type_7: "bf16[768, 768][768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg2_1, torch.bfloat16); arg2_1 = None | |
convert_element_type_8: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg6_1, torch.bfloat16) | |
view_3: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.view.default(convert_element_type_8, [832, 768]); convert_element_type_8 = None | |
permute_2: "bf16[768, 768][1, 768]cpu" = torch.ops.aten.permute.default(convert_element_type_7, [1, 0]); convert_element_type_7 = None | |
addmm_1: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.addmm.default(convert_element_type_6, view_3, permute_2); convert_element_type_6 = view_3 = permute_2 = None | |
view_4: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.view.default(addmm_1, [1, 832, 768]); addmm_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape) | |
view_5: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.view.default(view_4, [1, 832, 12, 64]); view_4 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3) | |
permute_3: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_5, [0, 2, 1, 3]); view_5 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:470 in forward, code: value_layer = self.transpose_for_scores(self.value(hidden_states)) | |
convert_element_type_12: "bf16[768][1]cpu" = torch.ops.prims.convert_element_type.default(arg5_1, torch.bfloat16); arg5_1 = None | |
convert_element_type_13: "bf16[768, 768][768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg4_1, torch.bfloat16); arg4_1 = None | |
convert_element_type_14: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg6_1, torch.bfloat16); arg6_1 = None | |
view_6: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.view.default(convert_element_type_14, [832, 768]); convert_element_type_14 = None | |
permute_4: "bf16[768, 768][1, 768]cpu" = torch.ops.aten.permute.default(convert_element_type_13, [1, 0]); convert_element_type_13 = None | |
addmm_2: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.addmm.default(convert_element_type_12, view_6, permute_4); convert_element_type_12 = view_6 = permute_4 = None | |
view_7: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.view.default(addmm_2, [1, 832, 768]); addmm_2 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape) | |
view_8: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.view.default(view_7, [1, 832, 12, 64]); view_7 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3) | |
permute_5: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_8, [0, 2, 1, 3]); view_8 = None | |
return (permute_1, permute_3, permute_5) | |
V0627 17:31:03.886000 139845268738432 torch/_inductor/compile_fx.py:754] {"inductor_post_grad_graph": {}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1, "has_payload": "1df7e432445d21ccb26ae7f35b4ee86e"} | |
class <lambda>(torch.nn.Module): | |
def forward(self, arg6_1: "f32[1, 832, 768][638976, 768, 1]cpu"): | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:468 in forward, code: query_layer = self.transpose_for_scores(self.query(hidden_states)) | |
_frozen_param6: "bf16[768][1]cpu" = self._frozen_param6 | |
# No stacktrace found for following nodes | |
_frozen_param12: "bf16[768, 768][1, 0]cpu" = self._frozen_param12 | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:469 in forward, code: key_layer = self.transpose_for_scores(self.key(hidden_states)) | |
_frozen_param8: "bf16[768][1]cpu" = self._frozen_param8 | |
# No stacktrace found for following nodes | |
_frozen_param13: "bf16[768, 768][1, 0]cpu" = self._frozen_param13 | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:470 in forward, code: value_layer = self.transpose_for_scores(self.value(hidden_states)) | |
_frozen_param10: "bf16[768][1]cpu" = self._frozen_param10 | |
# No stacktrace found for following nodes | |
_frozen_param14: "bf16[768, 768][1, 0]cpu" = self._frozen_param14 | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:468 in forward, code: query_layer = self.transpose_for_scores(self.query(hidden_states)) | |
convert_element_type_2: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg6_1, torch.bfloat16); arg6_1 = None | |
_linear_pointwise_default_5: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.mkldnn._linear_pointwise.default(convert_element_type_2, _frozen_param12, _frozen_param6, 'none', [], ''); _frozen_param12 = _frozen_param6 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape) | |
view_2: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.reshape.default(_linear_pointwise_default_5, [1, 832, 12, 64]); _linear_pointwise_default_5 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3) | |
permute_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_2, [0, 2, 1, 3]); view_2 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:469 in forward, code: key_layer = self.transpose_for_scores(self.key(hidden_states)) | |
_linear_pointwise_default_4: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.mkldnn._linear_pointwise.default(convert_element_type_2, _frozen_param13, _frozen_param8, 'none', [], ''); _frozen_param13 = _frozen_param8 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape) | |
view_5: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.reshape.default(_linear_pointwise_default_4, [1, 832, 12, 64]); _linear_pointwise_default_4 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3) | |
permute_3: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_5, [0, 2, 1, 3]); view_5 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:470 in forward, code: value_layer = self.transpose_for_scores(self.value(hidden_states)) | |
_linear_pointwise_default_3: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.mkldnn._linear_pointwise.default(convert_element_type_2, _frozen_param14, _frozen_param10, 'none', [], ''); convert_element_type_2 = _frozen_param14 = _frozen_param10 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape) | |
view_8: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.reshape.default(_linear_pointwise_default_3, [1, 832, 12, 64]); _linear_pointwise_default_3 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3) | |
permute_5: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_8, [0, 2, 1, 3]); view_8 = None | |
return (permute_1, permute_3, permute_5) | |
V0627 17:31:03.909000 139845268738432 torch/_inductor/graph.py:1693] {"inductor_output_code": {"filename": "/tmp/torchinductor_leslie/rm/crmmdl3pvsdue2ht6qffev3qnvhhdsc4zixorhqtjreztfur5zhi.py"}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1, "has_payload": "a40072c55bb96853547fea577aa47ba2"} | |
# AOT ID: ['5_inference'] | |
from ctypes import c_void_p, c_long | |
import torch | |
import math | |
import random | |
import os | |
import tempfile | |
from math import inf, nan | |
from torch._inductor.hooks import run_intermediate_hooks | |
from torch._inductor.utils import maybe_profile | |
from torch._inductor.codegen.memory_planning import _align as align | |
from torch import device, empty_strided | |
from torch._inductor.async_compile import AsyncCompile | |
from torch._inductor.select_algorithm import extern_kernels | |
from torch._inductor.codegen.multi_kernel import MultiKernelCall | |
aten = torch.ops.aten | |
inductor_ops = torch.ops.inductor | |
_quantized = torch.ops._quantized | |
assert_size_stride = torch._C._dynamo.guards.assert_size_stride | |
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu | |
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda | |
alloc_from_pool = torch.ops.inductor._alloc_from_pool | |
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor | |
async_compile = AsyncCompile() | |
_frozen_param6 = None # device(type='cpu') torch.bfloat16 (768,) (1,) 7f2e311efab0 | |
_frozen_param12 = None # device(type='cpu') torch.bfloat16 (768, 768) (1, 0) 7f2e311c2750 | |
_frozen_param8 = None # device(type='cpu') torch.bfloat16 (768,) (1,) 7f2e311b8770 | |
_frozen_param13 = None # device(type='cpu') torch.bfloat16 (768, 768) (1, 0) 7f2e311cefc0 | |
_frozen_param10 = None # device(type='cpu') torch.bfloat16 (768,) (1,) 7f2e311dbbf0 | |
_frozen_param14 = None # device(type='cpu') torch.bfloat16 (768, 768) (1, 0) 7f2e311ac090 | |
cpp_fused__to_copy_0 = async_compile.cpp_pybinding(['const float*', 'bfloat16*'], ''' | |
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h" | |
extern "C" void kernel(const float* in_ptr0, | |
bfloat16* out_ptr0) | |
{ | |
#pragma omp parallel num_threads(56) | |
{ | |
int tid = omp_get_thread_num(); | |
{ | |
#pragma omp for | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(638976L); x0+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<long>(x0), 16); | |
auto tmp1 = at::vec::convert<bfloat16>(tmp0); | |
tmp1.store(out_ptr0 + static_cast<long>(x0), 16); | |
} | |
} | |
} | |
} | |
''') | |
async_compile.wait(globals()) | |
del async_compile | |
def call(args): | |
arg6_1, = args | |
args.clear() | |
assert_size_stride(arg6_1, (1, 832, 768), (638976, 768, 1)) | |
buf0 = empty_strided_cpu((1, 832, 768), (638976, 768, 1), torch.bfloat16) | |
cpp_fused__to_copy_0(arg6_1, buf0) | |
del arg6_1 | |
buf1 = torch.ops.mkldnn._linear_pointwise(buf0, _frozen_param12, _frozen_param6, 'none', [-1], '') | |
buf2 = torch.ops.mkldnn._linear_pointwise(buf0, _frozen_param13, _frozen_param8, 'none', [-1], '') | |
buf3 = torch.ops.mkldnn._linear_pointwise(buf0, _frozen_param14, _frozen_param10, 'none', [-1], '') | |
return (reinterpret_tensor(buf1, (1, 12, 832, 64), (638976, 64, 768, 1), 0), reinterpret_tensor(buf2, (1, 12, 832, 64), (638976, 64, 768, 1), 0), reinterpret_tensor(buf3, (1, 12, 832, 64), (638976, 64, 768, 1), 0), ) | |
def benchmark_compiled_module(times=10, repeat=10): | |
from torch._dynamo.testing import rand_strided | |
from torch._inductor.utils import print_performance | |
global _frozen_param6 | |
_frozen_param6 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.bfloat16) | |
global _frozen_param12 | |
_frozen_param12 = rand_strided((768, 768), (1, 0), device='cpu', dtype=torch.bfloat16) | |
global _frozen_param8 | |
_frozen_param8 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.bfloat16) | |
global _frozen_param13 | |
_frozen_param13 = rand_strided((768, 768), (1, 0), device='cpu', dtype=torch.bfloat16) | |
global _frozen_param10 | |
_frozen_param10 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.bfloat16) | |
global _frozen_param14 | |
_frozen_param14 = rand_strided((768, 768), (1, 0), device='cpu', dtype=torch.bfloat16) | |
arg6_1 = rand_strided((1, 832, 768), (638976, 768, 1), device='cpu', dtype=torch.float32) | |
fn = lambda: call([arg6_1]) | |
return print_performance(fn, times=times, repeat=repeat) | |
if __name__ == "__main__": | |
from torch._inductor.wrapper_benchmark import compiled_module_main | |
compiled_module_main('hf_BigBird', benchmark_compiled_module) | |
V0627 17:31:03.922000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"} | |
[ | |
] | |
V0627 17:31:03.923000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1, "has_payload": "300400f770725170203fcbe28e6ee223"} | |
TREE_GUARD_MANAGER: | |
+- RootGuardManager | |
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards | |
| +- GLOBAL_STATE: ___check_global_state() | |
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self) | |
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202274384) | |
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824) | |
| | | +- GuardManager: source=L['self']._modules, accessed_by=DictGetItemGuardAccessor(_modules) | |
| | | | +- GuardManager: source=L['self'].key, accessed_by=DictGetItemGuardAccessor(key) | |
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].key, 139839202273568) | |
| | | | | +- GuardManager: source=L['self'].key.__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | | | | +- GuardManager: source=L['self'].key.training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].key.training, 7685824) | |
| | | | +- GuardManager: source=L['self'].query, accessed_by=DictGetItemGuardAccessor(query) | |
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].query, 139839202273616) | |
| | | | | +- GuardManager: source=L['self'].query.__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | | | | +- GuardManager: source=L['self'].query.training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].query.training, 7685824) | |
| | | | +- GuardManager: source=L['self'].value, accessed_by=DictGetItemGuardAccessor(value) | |
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].value, 139839202273040) | |
| | | | | +- GuardManager: source=L['self'].value.__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | | | | +- GuardManager: source=L['self'].value.training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].value.training, 7685824) | |
| | | +- GuardManager: source=L['self'].seed, accessed_by=DictGetItemGuardAccessor(seed) | |
| | | | +- EQUALS_MATCH: L['self'].seed == 0 | |
| | | +- GuardManager: source=L['self'].block_size, accessed_by=DictGetItemGuardAccessor(block_size) | |
| | | | +- EQUALS_MATCH: L['self'].block_size == 64 | |
| | | +- GuardManager: source=L['self'].num_random_blocks, accessed_by=DictGetItemGuardAccessor(num_random_blocks) | |
| | | | +- EQUALS_MATCH: L['self'].num_random_blocks == 3 | |
| | | +- GuardManager: source=L['self'].attention_head_size, accessed_by=DictGetItemGuardAccessor(attention_head_size) | |
| | | | +- EQUALS_MATCH: L['self'].attention_head_size == 64 | |
| | | +- GuardManager: source=L['self'].num_attention_heads, accessed_by=DictGetItemGuardAccessor(num_attention_heads) | |
| | | | +- EQUALS_MATCH: L['self'].num_attention_heads == 12 | |
| +- GuardManager: source=L['to_mask'], accessed_by=DictGetItemGuardAccessor(to_mask) | |
| | +- TENSOR_MATCH: check_tensor(L['to_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 1, 832], stride=[832, 832, 832, 1]) | |
| | +- NO_HASATTR: hasattr(L['to_mask'], '_dynamo_dynamic_indices') == False | |
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask']) | |
| +- GuardManager: source=L['band_mask'], accessed_by=DictGetItemGuardAccessor(band_mask) | |
| | +- TENSOR_MATCH: check_tensor(L['band_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 9, 64, 192], stride=[110592, 110592, 12288, 192, 1]) | |
| | +- NO_HASATTR: hasattr(L['band_mask'], '_dynamo_dynamic_indices') == False | |
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask']) | |
| +- GuardManager: source=L['from_mask'], accessed_by=DictGetItemGuardAccessor(from_mask) | |
| | +- TENSOR_MATCH: check_tensor(L['from_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 832, 1], stride=[832, 832, 1, 1]) | |
| | +- NO_HASATTR: hasattr(L['from_mask'], '_dynamo_dynamic_indices') == False | |
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask']) | |
| +- GuardManager: source=L['hidden_states'], accessed_by=DictGetItemGuardAccessor(hidden_states) | |
| | +- TENSOR_MATCH: check_tensor(L['hidden_states'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 832, 768], stride=[638976, 768, 1]) | |
| | +- NO_HASATTR: hasattr(L['hidden_states'], '_dynamo_dynamic_indices') == False | |
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask']) | |
| +- GuardManager: source=L['from_blocked_mask'], accessed_by=DictGetItemGuardAccessor(from_blocked_mask) | |
| | +- TENSOR_MATCH: check_tensor(L['from_blocked_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 13, 64], stride=[832, 64, 1]) | |
| | +- NO_HASATTR: hasattr(L['from_blocked_mask'], '_dynamo_dynamic_indices') == False | |
| | +- TENSOR_ALIASING: L['from_blocked_mask'] is L['to_blocked_mask'] | |
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask']) | |
| +- GuardManager: source=L['output_attentions'], accessed_by=DictGetItemGuardAccessor(output_attentions) | |
| | +- ID_MATCH: ___check_obj_id(L['output_attentions'], 7685824) | |
| +- GuardManager: source=L['to_blocked_mask'], accessed_by=DictGetItemGuardAccessor(to_blocked_mask) | |
| | +- TENSOR_ALIASING: L['from_blocked_mask'] is L['to_blocked_mask'] | |
V0627 17:31:03.923000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "9/0", "frame_key": "14", "co_name": "forward", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 446, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 21, "shape_env_guard_count": 0, "graph_op_count": 9, "graph_node_count": 11, "graph_input_count": 1, "start_time": 1719534663.7326608, "entire_frame_compile_time_s": 0.19047832489013672, "backend_compile_time_s": 0.14537477493286133, "inductor_compile_time_s": 0.0376286506652832, "code_gen_time_s": 0.016646862030029297, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Graph break due to unsupported builtin numpy.random.mtrand.seed. This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind). If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround. If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/docs/main/notes/custom_operators.html for more details) or, if it is traceable, use torch.compiler.allow_in_graph."], "dynamo_time_before_restart_s": 0.018494129180908203, "has_guarded_code": true}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.925000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 472, "name": "forward", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 10, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:03.985000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 10, "frame_compile_id": 0, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"} | |
[ | |
] | |
V0627 17:31:03.986000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 10, "frame_compile_id": 0, "attempt": 1, "has_payload": "f6f5661bfad0dc293ecc9ef35ede39a0"} | |
TREE_GUARD_MANAGER: | |
+- RootGuardManager | |
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards | |
| +- GLOBAL_STATE: ___check_global_state() | |
| +- GuardManager: source=L['seed'], accessed_by=DictGetItemGuardAccessor(seed) | |
| | +- EQUALS_MATCH: L['seed'] == 0 | |
| +- GuardManager: source=L['batch_size'], accessed_by=DictGetItemGuardAccessor(batch_size) | |
| | +- EQUALS_MATCH: L['batch_size'] == 1 | |
| +- GuardManager: source=L['to_seq_len'], accessed_by=DictGetItemGuardAccessor(to_seq_len) | |
| | +- EQUALS_MATCH: L['to_seq_len'] == 832 | |
| +- GuardManager: source=L['from_seq_len'], accessed_by=DictGetItemGuardAccessor(from_seq_len) | |
| | +- EQUALS_MATCH: L['from_seq_len'] == 832 | |
| +- GuardManager: source=L['to_block_size'], accessed_by=DictGetItemGuardAccessor(to_block_size) | |
| | +- EQUALS_MATCH: L['to_block_size'] == 64 | |
| +- GuardManager: source=L['from_block_size'], accessed_by=DictGetItemGuardAccessor(from_block_size) | |
| | +- EQUALS_MATCH: L['from_block_size'] == 64 | |
| +- GuardManager: source=L['attention_head_size'], accessed_by=DictGetItemGuardAccessor(attention_head_size) | |
| | +- EQUALS_MATCH: L['attention_head_size'] == 64 | |
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor | |
| | +- GuardManager: source=G['np'], accessed_by=DictGetItemGuardAccessor(np) | |
| | | +- ID_MATCH: ___check_obj_id(G['np'], 139845228893488) | |
| | | +- GuardManager: source=G['np'].random, accessed_by=GetAttrGuardAccessor(random) | |
| | | | +- ID_MATCH: ___check_obj_id(G['np'].random, 139842452860464) | |
| | | | +- GuardManager: source=G['np'].random.seed, accessed_by=GetAttrGuardAccessor(seed) | |
| | | | | +- ID_MATCH: ___check_obj_id(G['np'].random.seed, 139842451129264) | |
| | +- GuardManager: source=G['math'], accessed_by=DictGetItemGuardAccessor(math) | |
| | | +- ID_MATCH: ___check_obj_id(G['math'], 139845236089744) | |
| | | +- GuardManager: source=G['math'].sqrt, accessed_by=GetAttrGuardAccessor(sqrt) | |
| | | | +- ID_MATCH: ___check_obj_id(G['math'].sqrt, 139845236093344) | |
V0627 17:31:03.986000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "10/0", "frame_key": "15", "co_name": "bigbird_block_sparse_attention", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 516, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 17, "shape_env_guard_count": 0, "graph_op_count": 0, "graph_node_count": 0, "graph_input_count": 0, "start_time": 1719534663.9255476, "entire_frame_compile_time_s": 0.06077218055725098, "backend_compile_time_s": null, "inductor_compile_time_s": null, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Graph break due to unsupported builtin numpy.random.mtrand.seed. This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind). If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround. If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/docs/main/notes/custom_operators.html for more details) or, if it is traceable, use torch.compiler.allow_in_graph."], "dynamo_time_before_restart_s": 0.008862972259521484, "has_guarded_code": true}, "frame_id": 10, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:03.987000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 472, "name": "forward", "filename": 5}, {"line": 569, "name": "bigbird_block_sparse_attention", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 11, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.076000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 11, "frame_compile_id": 0, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"} | |
[ | |
] | |
V0627 17:31:04.077000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 11, "frame_compile_id": 0, "attempt": 1, "has_payload": "f3df28d4d21dab674ac56179543067e7"} | |
TREE_GUARD_MANAGER: | |
+- RootGuardManager | |
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards | |
| +- GLOBAL_STATE: ___check_global_state() | |
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self) | |
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202274384) | |
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824) | |
| +- GuardManager: source=L['n_heads'], accessed_by=DictGetItemGuardAccessor(n_heads) | |
| | +- EQUALS_MATCH: L['n_heads'] == 12 | |
| +- GuardManager: source=L['to_seq_len'], accessed_by=DictGetItemGuardAccessor(to_seq_len) | |
| | +- EQUALS_MATCH: L['to_seq_len'] == 832 | |
| +- GuardManager: source=L['from_seq_len'], accessed_by=DictGetItemGuardAccessor(from_seq_len) | |
| | +- EQUALS_MATCH: L['from_seq_len'] == 832 | |
| +- GuardManager: source=L['n_rand_blocks'], accessed_by=DictGetItemGuardAccessor(n_rand_blocks) | |
| | +- EQUALS_MATCH: L['n_rand_blocks'] == 3 | |
| +- GuardManager: source=L['to_block_size'], accessed_by=DictGetItemGuardAccessor(to_block_size) | |
| | +- EQUALS_MATCH: L['to_block_size'] == 64 | |
| +- GuardManager: source=L['from_block_size'], accessed_by=DictGetItemGuardAccessor(from_block_size) | |
| | +- EQUALS_MATCH: L['from_block_size'] == 64 | |
| +- GuardManager: source=L['plan_from_length'], accessed_by=DictGetItemGuardAccessor(plan_from_length) | |
| | +- ID_MATCH: ___check_obj_id(L['plan_from_length'], 7636800) | |
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor | |
| | +- GuardManager: source=G['__builtins_dict___37'], accessed_by=DictGetItemGuardAccessor(__builtins_dict___37) | |
| | | +- GuardManager: source=G['__builtins_dict___37']['int'], accessed_by=DictGetItemGuardAccessor(int) | |
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___37']['int'], 7648640) | |
V0627 17:31:04.077000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "11/0", "frame_key": "16", "co_name": "torch_dynamo_resume_in_bigbird_block_sparse_attention_at_569", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 569, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 14, "shape_env_guard_count": 0, "graph_op_count": 0, "graph_node_count": 0, "graph_input_count": 0, "start_time": 1719534663.987453, "entire_frame_compile_time_s": 0.08972334861755371, "backend_compile_time_s": null, "inductor_compile_time_s": null, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["data dependent operator: aten._local_scalar_dense.default; to enable, set torch._dynamo.config.capture_scalar_outputs = True"], "dynamo_time_before_restart_s": 0.02129364013671875, "has_guarded_code": true}, "frame_id": 11, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:04.078000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 472, "name": "forward", "filename": 5}, {"line": 569, "name": "bigbird_block_sparse_attention", "filename": 5}, {"line": 583, "name": "torch_dynamo_resume_in_bigbird_block_sparse_attention_at_569", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 12, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.094000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {"wrapped_array": [2], "plan_block_length": [2]}}, "frame_id": 12, "frame_compile_id": 0, "attempt": 1, "has_payload": "9f82d9593d608d32ba61e6298aeb3649"} | |
class GraphModule(torch.nn.Module): | |
def forward(self): | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1160 in _bigbird_block_rand_mask_with_head, code: plan_block_length = np.array(plan_from_length) // from_block_size | |
wrapped_array: "i64[2][1]cpu" = torch__dynamo_utils_wrapped_array([704, 832]) | |
plan_block_length: "i64[2][1]cpu" = torch__dynamo_utils_wrapped_floordiv(wrapped_array, 64); wrapped_array = None | |
return (plan_block_length,) | |
V0627 17:31:04.106000 139845268738432 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:199] {"aot_forward_graph": {}, "frame_id": 12, "frame_compile_id": 0, "attempt": 1, "has_payload": "1861734e40a5f61860344b326195085c"} | |
class <lambda>(torch.nn.Module): | |
def forward(self): | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1160 in _bigbird_block_rand_mask_with_head, code: plan_block_length = np.array(plan_from_length) // from_block_size | |
_tensor_constant0 = self._tensor_constant0 | |
lift_fresh_copy: "i64[2][1]cpu" = torch.ops.aten.lift_fresh_copy.default(_tensor_constant0); _tensor_constant0 = None | |
clone: "i64[2][1]cpu" = torch.ops.aten.clone.default(lift_fresh_copy); lift_fresh_copy = None | |
div: "i64[2][1]cpu" = torch.ops.aten.div.Tensor_mode(clone, 64, rounding_mode = 'floor'); clone = None | |
return (div,) | |
V0627 17:31:04.122000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 12, "frame_compile_id": 0, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"} | |
[ | |
] | |
V0627 17:31:04.122000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 12, "frame_compile_id": 0, "attempt": 1, "has_payload": "c1fc05dff62c1bc070ea06a12430d940"} | |
TREE_GUARD_MANAGER: | |
+- RootGuardManager | |
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards | |
| +- GLOBAL_STATE: ___check_global_state() | |
| +- GuardManager: source=L['num_heads'], accessed_by=DictGetItemGuardAccessor(num_heads) | |
| | +- EQUALS_MATCH: L['num_heads'] == 12 | |
| +- GuardManager: source=L['to_block_size'], accessed_by=DictGetItemGuardAccessor(to_block_size) | |
| | +- EQUALS_MATCH: L['to_block_size'] == 64 | |
| +- GuardManager: source=L['to_seq_length'], accessed_by=DictGetItemGuardAccessor(to_seq_length) | |
| | +- EQUALS_MATCH: L['to_seq_length'] == 832 | |
| +- GuardManager: source=L['from_block_size'], accessed_by=DictGetItemGuardAccessor(from_block_size) | |
| | +- EQUALS_MATCH: L['from_block_size'] == 64 | |
| +- GuardManager: source=L['from_seq_length'], accessed_by=DictGetItemGuardAccessor(from_seq_length) | |
| | +- EQUALS_MATCH: L['from_seq_length'] == 832 | |
| +- GuardManager: source=L['plan_from_length'], accessed_by=DictGetItemGuardAccessor(plan_from_length) | |
| | +- TYPE_MATCH: ___check_type_id(L['plan_from_length'], 7650400) | |
| | +- LENGTH_CHECK: len(L['plan_from_length']) == 2 | |
| | +- GuardManager: source=L['plan_from_length'][0], accessed_by=ListGetItemGuardAccessor(0) | |
| | | +- EQUALS_MATCH: L['plan_from_length'][0] == 704 | |
| | +- GuardManager: source=L['plan_from_length'][1], accessed_by=ListGetItemGuardAccessor(1) | |
| | | +- EQUALS_MATCH: L['plan_from_length'][1] == 832 | |
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor | |
| | +- GuardManager: source=G['np'], accessed_by=DictGetItemGuardAccessor(np) | |
| | | +- ID_MATCH: ___check_obj_id(G['np'], 139845228893488) | |
| | | +- GuardManager: source=G['np'].array, accessed_by=GetAttrGuardAccessor(array) | |
| | | | +- ID_MATCH: ___check_obj_id(G['np'].array, 139845228959664) | |
| | +- GuardManager: source=G['__builtins_dict___40'], accessed_by=DictGetItemGuardAccessor(__builtins_dict___40) | |
| | | +- GuardManager: source=G['__builtins_dict___40']['list'], accessed_by=DictGetItemGuardAccessor(list) | |
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___40']['list'], 7650400) | |
| | | +- GuardManager: source=G['__builtins_dict___40']['range'], accessed_by=DictGetItemGuardAccessor(range) | |
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___40']['range'], 7632448) | |
| | | +- GuardManager: source=G['__builtins_dict___40']['enumerate'], accessed_by=DictGetItemGuardAccessor(enumerate) | |
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___40']['enumerate'], 7513024) | |
| | +- GuardManager: source=G['__import_torch_dot__dynamo_dot_polyfill'], accessed_by=DictGetItemGuardAccessor(__import_torch_dot__dynamo_dot_polyfill) | |
| | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot__dynamo_dot_polyfill'], 139839728158336) | |
V0627 17:31:04.122000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "12/0", "frame_key": "17", "co_name": "_bigbird_block_rand_mask_with_head", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 1111, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 19, "shape_env_guard_count": 0, "graph_op_count": 2, "graph_node_count": 3, "graph_input_count": 0, "start_time": 1719534664.0783854, "entire_frame_compile_time_s": 0.04439258575439453, "backend_compile_time_s": 0.01859426498413086, "inductor_compile_time_s": 0.00021767616271972656, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["data dependent operator: aten._local_scalar_dense.default; to enable, set torch._dynamo.config.capture_scalar_outputs = True"], "dynamo_time_before_restart_s": 0.00835871696472168, "has_guarded_code": true}, "frame_id": 12, "frame_compile_id": 0, "attempt": 1} | |
V0627 17:31:04.123000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 472, "name": "forward", "filename": 5}, {"line": 569, "name": "bigbird_block_sparse_attention", "filename": 5}, {"line": 583, "name": "torch_dynamo_resume_in_bigbird_block_sparse_attention_at_569", "filename": 5}, {"line": 1165, "name": "_bigbird_block_rand_mask_with_head", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, 
"frame_id": 13, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.127000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "13/0", "frame_key": "18", "co_name": "<listcomp>", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 1165, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": null, "shape_env_guard_count": null, "graph_op_count": null, "graph_node_count": null, "graph_input_count": null, "start_time": 1719534664.1237168, "entire_frame_compile_time_s": null, "backend_compile_time_s": null, "inductor_compile_time_s": null, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": [], "dynamo_time_before_restart_s": 0.0032224655151367188, "has_guarded_code": false}, "frame_id": 13, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.127000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 472, "name": "forward", "filename": 5}, {"line": 569, "name": "bigbird_block_sparse_attention", "filename": 5}, {"line": 583, "name": "torch_dynamo_resume_in_bigbird_block_sparse_attention_at_569", "filename": 5}, {"line": 1165, "name": "_bigbird_block_rand_mask_with_head", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, 
"frame_id": 14, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.131000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.131000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311dafc0>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.131000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 36, "id": 0, "source": "___from_numpy(L['___stack0'][0])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.133000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.133000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311c3510>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.133000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 36, "id": 3, "source": "___from_numpy(L['___stack0'][1])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.135000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.135000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 4, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30db3dd0>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.135000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 36, "id": 4, "source": "___from_numpy(L['___stack0'][2])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.136000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 3, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.136000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 5, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 3, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30db1fd0>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.136000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 36, "id": 5, "source": "___from_numpy(L['___stack0'][3])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.138000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 4, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.138000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 6, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 4, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311daf70>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.138000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 36, "id": 6, "source": "___from_numpy(L['___stack0'][4])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.139000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 5, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.139000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 7, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 5, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311b8ea0>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.140000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 36, "id": 7, "source": "___from_numpy(L['___stack0'][5])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.141000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 6, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.141000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 8, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 6, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311ba200>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.141000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 36, "id": 8, "source": "___from_numpy(L['___stack0'][6])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.142000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 7, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.142000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 9, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 7, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e3119dd50>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.142000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 36, "id": 9, "source": "___from_numpy(L['___stack0'][7])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.144000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 8, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.144000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 10, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 8, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e3119c6d0>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.144000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 36, "id": 10, "source": "___from_numpy(L['___stack0'][8])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.145000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 9, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.146000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 11, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 9, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e3119c4a0>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.146000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 36, "id": 11, "source": "___from_numpy(L['___stack0'][9])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.147000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 10, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.147000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 12, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 10, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30d4d580>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.147000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 36, "id": 12, "source": "___from_numpy(L['___stack0'][10])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.148000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 11, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.149000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 13, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 11, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30d4df30>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.149000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 36, "id": 13, "source": "___from_numpy(L['___stack0'][11])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.153000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {"l_stack0_0_": [13, 3], "l_stack0_1_": [13, 3], "l_stack0_2_": [13, 3], "l_stack0_3_": [13, 3], "l_stack0_4_": [13, 3], "l_stack0_5_": [13, 3], "l_stack0_6_": [13, 3], "l_stack0_7_": [13, 3], "l_stack0_8_": [13, 3], "l_stack0_9_": [13, 3], "l_stack0_10_": [13, 3], "l_stack0_11_": [13, 3], "wrapped_getitem": [11, 3], "wrapped_getitem_1": [11, 3], "wrapped_getitem_2": [11, 3], "wrapped_getitem_3": [11, 3], "wrapped_getitem_4": [11, 3], "wrapped_getitem_5": [11, 3], "wrapped_getitem_6": [11, 3], "wrapped_getitem_7": [11, 3], "wrapped_getitem_8": [11, 3], "wrapped_getitem_9": [11, 3], "wrapped_getitem_10": [11, 3], "wrapped_getitem_11": [11, 3]}}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0, "has_payload": "17b857365a2979b2936469716bb70fcd"} | |
class GraphModule(torch.nn.Module): | |
def forward(self, L_stack0_0_: "i32[13, 3][3, 1]cpu", L_stack0_1_: "i32[13, 3][3, 1]cpu", L_stack0_2_: "i32[13, 3][3, 1]cpu", L_stack0_3_: "i32[13, 3][3, 1]cpu", L_stack0_4_: "i32[13, 3][3, 1]cpu", L_stack0_5_: "i32[13, 3][3, 1]cpu", L_stack0_6_: "i32[13, 3][3, 1]cpu", L_stack0_7_: "i32[13, 3][3, 1]cpu", L_stack0_8_: "i32[13, 3][3, 1]cpu", L_stack0_9_: "i32[13, 3][3, 1]cpu", L_stack0_10_: "i32[13, 3][3, 1]cpu", L_stack0_11_: "i32[13, 3][3, 1]cpu"): | |
l_stack0_0_ = L_stack0_0_ | |
l_stack0_1_ = L_stack0_1_ | |
l_stack0_2_ = L_stack0_2_ | |
l_stack0_3_ = L_stack0_3_ | |
l_stack0_4_ = L_stack0_4_ | |
l_stack0_5_ = L_stack0_5_ | |
l_stack0_6_ = L_stack0_6_ | |
l_stack0_7_ = L_stack0_7_ | |
l_stack0_8_ = L_stack0_8_ | |
l_stack0_9_ = L_stack0_9_ | |
l_stack0_10_ = L_stack0_10_ | |
l_stack0_11_ = L_stack0_11_ | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1172 in torch_dynamo_resume_in__bigbird_block_rand_mask_with_head_at_1165, code: rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :] | |
wrapped_getitem: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem(l_stack0_0_, (slice(1, 12, None), slice(None, None, None))); l_stack0_0_ = None | |
wrapped_getitem_1: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_1(l_stack0_1_, (slice(1, 12, None), slice(None, None, None))); l_stack0_1_ = None | |
wrapped_getitem_2: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_2(l_stack0_2_, (slice(1, 12, None), slice(None, None, None))); l_stack0_2_ = None | |
wrapped_getitem_3: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_3(l_stack0_3_, (slice(1, 12, None), slice(None, None, None))); l_stack0_3_ = None | |
wrapped_getitem_4: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_4(l_stack0_4_, (slice(1, 12, None), slice(None, None, None))); l_stack0_4_ = None | |
wrapped_getitem_5: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_5(l_stack0_5_, (slice(1, 12, None), slice(None, None, None))); l_stack0_5_ = None | |
wrapped_getitem_6: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_6(l_stack0_6_, (slice(1, 12, None), slice(None, None, None))); l_stack0_6_ = None | |
wrapped_getitem_7: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_7(l_stack0_7_, (slice(1, 12, None), slice(None, None, None))); l_stack0_7_ = None | |
wrapped_getitem_8: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_8(l_stack0_8_, (slice(1, 12, None), slice(None, None, None))); l_stack0_8_ = None | |
wrapped_getitem_9: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_9(l_stack0_9_, (slice(1, 12, None), slice(None, None, None))); l_stack0_9_ = None | |
wrapped_getitem_10: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_10(l_stack0_10_, (slice(1, 12, None), slice(None, None, None))); l_stack0_10_ = None | |
wrapped_getitem_11: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_11(l_stack0_11_, (slice(1, 12, None), slice(None, None, None))); l_stack0_11_ = None | |
return (wrapped_getitem, wrapped_getitem_1, wrapped_getitem_2, wrapped_getitem_3, wrapped_getitem_4, wrapped_getitem_5, wrapped_getitem_6, wrapped_getitem_7, wrapped_getitem_8, wrapped_getitem_9, wrapped_getitem_10, wrapped_getitem_11) | |
V0627 17:31:04.205000 139845268738432 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:199] {"aot_forward_graph": {}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0, "has_payload": "949e5e36955b193a43ad76da2f5a5f52"} | |
class <lambda>(torch.nn.Module): | |
def forward(self, arg0_1: "i32[13, 3][3, 1]cpu", arg1_1: "i32[13, 3][3, 1]cpu", arg2_1: "i32[13, 3][3, 1]cpu", arg3_1: "i32[13, 3][3, 1]cpu", arg4_1: "i32[13, 3][3, 1]cpu", arg5_1: "i32[13, 3][3, 1]cpu", arg6_1: "i32[13, 3][3, 1]cpu", arg7_1: "i32[13, 3][3, 1]cpu", arg8_1: "i32[13, 3][3, 1]cpu", arg9_1: "i32[13, 3][3, 1]cpu", arg10_1: "i32[13, 3][3, 1]cpu", arg11_1: "i32[13, 3][3, 1]cpu"): | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1172 in torch_dynamo_resume_in__bigbird_block_rand_mask_with_head_at_1165, code: rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :] | |
slice_1: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg0_1, 0, 1, 12); arg0_1 = None | |
slice_2: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_1, 1, 0, 9223372036854775807); slice_1 = None | |
slice_3: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg1_1, 0, 1, 12); arg1_1 = None | |
slice_4: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_3, 1, 0, 9223372036854775807); slice_3 = None | |
slice_5: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg2_1, 0, 1, 12); arg2_1 = None | |
slice_6: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_5, 1, 0, 9223372036854775807); slice_5 = None | |
slice_7: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg3_1, 0, 1, 12); arg3_1 = None | |
slice_8: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_7, 1, 0, 9223372036854775807); slice_7 = None | |
slice_9: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg4_1, 0, 1, 12); arg4_1 = None | |
slice_10: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_9, 1, 0, 9223372036854775807); slice_9 = None | |
slice_11: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg5_1, 0, 1, 12); arg5_1 = None | |
slice_12: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_11, 1, 0, 9223372036854775807); slice_11 = None | |
slice_13: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg6_1, 0, 1, 12); arg6_1 = None | |
slice_14: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_13, 1, 0, 9223372036854775807); slice_13 = None | |
slice_15: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg7_1, 0, 1, 12); arg7_1 = None | |
slice_16: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_15, 1, 0, 9223372036854775807); slice_15 = None | |
slice_17: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg8_1, 0, 1, 12); arg8_1 = None | |
slice_18: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_17, 1, 0, 9223372036854775807); slice_17 = None | |
slice_19: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg9_1, 0, 1, 12); arg9_1 = None | |
slice_20: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_19, 1, 0, 9223372036854775807); slice_19 = None | |
slice_21: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg10_1, 0, 1, 12); arg10_1 = None | |
slice_22: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_21, 1, 0, 9223372036854775807); slice_21 = None | |
slice_23: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg11_1, 0, 1, 12); arg11_1 = None | |
slice_24: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_23, 1, 0, 9223372036854775807); slice_23 = None | |
return (slice_2, slice_4, slice_6, slice_8, slice_10, slice_12, slice_14, slice_16, slice_18, slice_20, slice_22, slice_24) | |
V0627 17:31:04.236000 139845268738432 torch/_inductor/compile_fx.py:754] {"inductor_post_grad_graph": {}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0, "has_payload": "9c01c298863b324227937cc74dd4d962"} | |
class <lambda>(torch.nn.Module): | |
def forward(self, arg0_1: "i32[13, 3][3, 1]cpu", arg1_1: "i32[13, 3][3, 1]cpu", arg2_1: "i32[13, 3][3, 1]cpu", arg3_1: "i32[13, 3][3, 1]cpu", arg4_1: "i32[13, 3][3, 1]cpu", arg5_1: "i32[13, 3][3, 1]cpu", arg6_1: "i32[13, 3][3, 1]cpu", arg7_1: "i32[13, 3][3, 1]cpu", arg8_1: "i32[13, 3][3, 1]cpu", arg9_1: "i32[13, 3][3, 1]cpu", arg10_1: "i32[13, 3][3, 1]cpu", arg11_1: "i32[13, 3][3, 1]cpu"): | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1172 in torch_dynamo_resume_in__bigbird_block_rand_mask_with_head_at_1165, code: rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :] | |
slice_1: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg0_1, 0, 1, 12); arg0_1 = None | |
slice_3: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg1_1, 0, 1, 12); arg1_1 = None | |
slice_5: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg2_1, 0, 1, 12); arg2_1 = None | |
slice_7: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg3_1, 0, 1, 12); arg3_1 = None | |
slice_9: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg4_1, 0, 1, 12); arg4_1 = None | |
slice_11: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg5_1, 0, 1, 12); arg5_1 = None | |
slice_13: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg6_1, 0, 1, 12); arg6_1 = None | |
slice_15: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg7_1, 0, 1, 12); arg7_1 = None | |
slice_17: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg8_1, 0, 1, 12); arg8_1 = None | |
slice_19: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg9_1, 0, 1, 12); arg9_1 = None | |
slice_21: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg10_1, 0, 1, 12); arg10_1 = None | |
slice_23: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg11_1, 0, 1, 12); arg11_1 = None | |
return (slice_1, slice_3, slice_5, slice_7, slice_9, slice_11, slice_13, slice_15, slice_17, slice_19, slice_21, slice_23) | |
V0627 17:31:04.250000 139845268738432 torch/_inductor/graph.py:1693] {"inductor_output_code": {"filename": "/tmp/torchinductor_leslie/ki/ckiwvz5uks7efnnd5ew4d76276bcutmxkkfh6ozi5dgumqv3m2qc.py"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0, "has_payload": "6641116284eedbc64e23effbbbfe40e6"} | |
# AOT ID: ['7_inference'] | |
from ctypes import c_void_p, c_long | |
import torch | |
import math | |
import random | |
import os | |
import tempfile | |
from math import inf, nan | |
from torch._inductor.hooks import run_intermediate_hooks | |
from torch._inductor.utils import maybe_profile | |
from torch._inductor.codegen.memory_planning import _align as align | |
from torch import device, empty_strided | |
from torch._inductor.async_compile import AsyncCompile | |
from torch._inductor.select_algorithm import extern_kernels | |
from torch._inductor.codegen.multi_kernel import MultiKernelCall | |
aten = torch.ops.aten | |
inductor_ops = torch.ops.inductor | |
_quantized = torch.ops._quantized | |
assert_size_stride = torch._C._dynamo.guards.assert_size_stride | |
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu | |
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda | |
alloc_from_pool = torch.ops.inductor._alloc_from_pool | |
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor | |
async_compile = AsyncCompile() | |
async_compile.wait(globals()) | |
del async_compile | |
def call(args): | |
arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1 = args | |
args.clear() | |
assert_size_stride(arg0_1, (13, 3), (3, 1)) | |
assert_size_stride(arg1_1, (13, 3), (3, 1)) | |
assert_size_stride(arg2_1, (13, 3), (3, 1)) | |
assert_size_stride(arg3_1, (13, 3), (3, 1)) | |
assert_size_stride(arg4_1, (13, 3), (3, 1)) | |
assert_size_stride(arg5_1, (13, 3), (3, 1)) | |
assert_size_stride(arg6_1, (13, 3), (3, 1)) | |
assert_size_stride(arg7_1, (13, 3), (3, 1)) | |
assert_size_stride(arg8_1, (13, 3), (3, 1)) | |
assert_size_stride(arg9_1, (13, 3), (3, 1)) | |
assert_size_stride(arg10_1, (13, 3), (3, 1)) | |
assert_size_stride(arg11_1, (13, 3), (3, 1)) | |
return (reinterpret_tensor(arg0_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg1_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg2_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg3_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg4_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg5_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg6_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg7_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg8_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg9_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg10_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg11_1, (11, 3), (3, 1), 3), ) | |
def benchmark_compiled_module(times=10, repeat=10): | |
from torch._dynamo.testing import rand_strided | |
from torch._inductor.utils import print_performance | |
arg0_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32) | |
arg1_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32) | |
arg2_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32) | |
arg3_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32) | |
arg4_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32) | |
arg5_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32) | |
arg6_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32) | |
arg7_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32) | |
arg8_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32) | |
arg9_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32) | |
arg10_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32) | |
arg11_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32) | |
fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1]) | |
return print_performance(fn, times=times, repeat=repeat) | |
if __name__ == "__main__": | |
from torch._inductor.wrapper_benchmark import compiled_module_main | |
compiled_module_main('hf_BigBird', benchmark_compiled_module) | |
V0627 17:31:04.258000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"} | |
[ | |
] | |
V0627 17:31:04.259000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0, "has_payload": "bb9e1aaf4decc7f300fbb51ff6f34967"} | |
TREE_GUARD_MANAGER: | |
+- RootGuardManager | |
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards | |
| +- GLOBAL_STATE: ___check_global_state() | |
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self) | |
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202274384) | |
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824) | |
| +- GuardManager: source=L['___stack0'], accessed_by=DictGetItemGuardAccessor(___stack0) | |
| | +- TYPE_MATCH: ___check_type_id(L['___stack0'], 7650400) | |
| | +- LENGTH_CHECK: len(L['___stack0']) == 12 | |
| | +- GuardManager: source=L['___stack0'][0], accessed_by=ListGetItemGuardAccessor(0) | |
| | | +- GuardManager: source=___from_numpy(L['___stack0'][0]), accessed_by=PythonLambdaGuardAccessor | |
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][0]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1]) | |
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][0]), '_dynamo_dynamic_indices') == False | |
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11])) | |
| | +- GuardManager: source=L['___stack0'][1], accessed_by=ListGetItemGuardAccessor(1) | |
| | | +- GuardManager: source=___from_numpy(L['___stack0'][1]), accessed_by=PythonLambdaGuardAccessor | |
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][1]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1]) | |
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][1]), '_dynamo_dynamic_indices') == False | |
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11])) | |
| | +- GuardManager: source=L['___stack0'][2], accessed_by=ListGetItemGuardAccessor(2) | |
| | | +- GuardManager: source=___from_numpy(L['___stack0'][2]), accessed_by=PythonLambdaGuardAccessor | |
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][2]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1]) | |
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][2]), '_dynamo_dynamic_indices') == False | |
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11])) | |
| | +- GuardManager: source=L['___stack0'][3], accessed_by=ListGetItemGuardAccessor(3) | |
| | | +- GuardManager: source=___from_numpy(L['___stack0'][3]), accessed_by=PythonLambdaGuardAccessor | |
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][3]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1]) | |
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][3]), '_dynamo_dynamic_indices') == False | |
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11])) | |
| | +- GuardManager: source=L['___stack0'][4], accessed_by=ListGetItemGuardAccessor(4) | |
| | | +- GuardManager: source=___from_numpy(L['___stack0'][4]), accessed_by=PythonLambdaGuardAccessor | |
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][4]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1]) | |
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][4]), '_dynamo_dynamic_indices') == False | |
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11])) | |
| | +- GuardManager: source=L['___stack0'][5], accessed_by=ListGetItemGuardAccessor(5) | |
| | | +- GuardManager: source=___from_numpy(L['___stack0'][5]), accessed_by=PythonLambdaGuardAccessor | |
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][5]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1]) | |
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][5]), '_dynamo_dynamic_indices') == False | |
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11])) | |
| | +- GuardManager: source=L['___stack0'][6], accessed_by=ListGetItemGuardAccessor(6) | |
| | | +- GuardManager: source=___from_numpy(L['___stack0'][6]), accessed_by=PythonLambdaGuardAccessor | |
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][6]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1]) | |
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][6]), '_dynamo_dynamic_indices') == False | |
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11])) | |
| | +- GuardManager: source=L['___stack0'][7], accessed_by=ListGetItemGuardAccessor(7) | |
| | | +- GuardManager: source=___from_numpy(L['___stack0'][7]), accessed_by=PythonLambdaGuardAccessor | |
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][7]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1]) | |
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][7]), '_dynamo_dynamic_indices') == False | |
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11])) | |
| | +- GuardManager: source=L['___stack0'][8], accessed_by=ListGetItemGuardAccessor(8) | |
| | | +- GuardManager: source=___from_numpy(L['___stack0'][8]), accessed_by=PythonLambdaGuardAccessor | |
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][8]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1]) | |
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][8]), '_dynamo_dynamic_indices') == False | |
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11])) | |
| | +- GuardManager: source=L['___stack0'][9], accessed_by=ListGetItemGuardAccessor(9) | |
| | | +- GuardManager: source=___from_numpy(L['___stack0'][9]), accessed_by=PythonLambdaGuardAccessor | |
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][9]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1]) | |
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][9]), '_dynamo_dynamic_indices') == False | |
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11])) | |
| | +- GuardManager: source=L['___stack0'][10], accessed_by=ListGetItemGuardAccessor(10) | |
| | | +- GuardManager: source=___from_numpy(L['___stack0'][10]), accessed_by=PythonLambdaGuardAccessor | |
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][10]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1]) | |
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][10]), '_dynamo_dynamic_indices') == False | |
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11])) | |
| | +- GuardManager: source=L['___stack0'][11], accessed_by=ListGetItemGuardAccessor(11) | |
| | | +- GuardManager: source=___from_numpy(L['___stack0'][11]), accessed_by=PythonLambdaGuardAccessor | |
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][11]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1]) | |
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][11]), '_dynamo_dynamic_indices') == False | |
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11])) | |
| +- GuardManager: source=L['num_heads'], accessed_by=DictGetItemGuardAccessor(num_heads) | |
| | +- EQUALS_MATCH: L['num_heads'] == 12 | |
| +- GuardManager: source=L['num_blocks'], accessed_by=DictGetItemGuardAccessor(num_blocks) | |
| | +- EQUALS_MATCH: L['num_blocks'] == 13 | |
| +- GuardManager: source=L['global_block_top'], accessed_by=DictGetItemGuardAccessor(global_block_top) | |
| | +- EQUALS_MATCH: L['global_block_top'] == 1 | |
| +- GuardManager: source=L['global_block_bottom'], accessed_by=DictGetItemGuardAccessor(global_block_bottom) | |
| | +- EQUALS_MATCH: L['global_block_bottom'] == 1 | |
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor | |
| | +- GuardManager: source=G['__builtins_dict___44'], accessed_by=DictGetItemGuardAccessor(__builtins_dict___44) | |
| | | +- GuardManager: source=G['__builtins_dict___44']['range'], accessed_by=DictGetItemGuardAccessor(range) | |
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___44']['range'], 7632448) | |
V0627 17:31:04.259000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "14/0", "frame_key": "19", "co_name": "torch_dynamo_resume_in__bigbird_block_rand_mask_with_head_at_1165", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 1165, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 25, "shape_env_guard_count": 0, "graph_op_count": 12, "graph_node_count": 25, "graph_input_count": 12, "start_time": 1719534664.1278515, "entire_frame_compile_time_s": 0.13145732879638672, "backend_compile_time_s": 0.09916210174560547, "inductor_compile_time_s": 0.022524356842041016, "code_gen_time_s": 0.003596067428588867, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": [], "dynamo_time_before_restart_s": 0.0, "has_guarded_code": true}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.260000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 472, "name": "forward", "filename": 5}, {"line": 569, "name": "bigbird_block_sparse_attention", "filename": 5}, {"line": 583, "name": "torch_dynamo_resume_in_bigbird_block_sparse_attention_at_569", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.267000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.267000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e310b7060>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.267000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 0, "source": "___from_numpy(L['___stack0'][0])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.268000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.268000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311ad1c0>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.268000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 1, "source": "___from_numpy(L['___stack0'][1])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.269000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.269000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 2, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e3119c810>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.269000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 2, "source": "___from_numpy(L['___stack0'][2])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.269000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 3, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.270000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 3, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e3119ef70>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.270000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 3, "source": "___from_numpy(L['___stack0'][3])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.270000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 4, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.270000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 4, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 4, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311c58a0>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.270000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 4, "source": "___from_numpy(L['___stack0'][4])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.271000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 5, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.271000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 5, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 5, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30db1d00>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.271000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 5, "source": "___from_numpy(L['___stack0'][5])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.272000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 6, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.272000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 6, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 6, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e310b9f30>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.272000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 6, "source": "___from_numpy(L['___stack0'][6])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.273000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 7, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.273000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 7, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 7, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311ac310>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.273000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 7, "source": "___from_numpy(L['___stack0'][7])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.274000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 8, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.274000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 8, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 8, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e310bb100>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.274000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 8, "source": "___from_numpy(L['___stack0'][8])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.274000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 9, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.275000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 9, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 9, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30d4d580>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.275000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 9, "source": "___from_numpy(L['___stack0'][9])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.275000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 10, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.275000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 10, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 10, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30d4c450>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.275000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 10, "source": "___from_numpy(L['___stack0'][10])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.276000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 11, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.276000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 11, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 11, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e310f5e90>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.276000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 11, "source": "___from_numpy(L['___stack0'][11])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.279000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 12, "describer_id": 38, "size": 1277952}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.279000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 13, "ndim": 4, "dtype": "torch.bfloat16", "device": "device(type='cpu')", "size": [1, 12, 832, 64], "is_leaf": true, "stride": [638976, 64, 768, 1], "storage": 12, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311c6840>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.279000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 13, "source": "L['query_layer']"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.283000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 13, "describer_id": 38, "size": 3328}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.283000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 17, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 13, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.284000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 16, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 13, 64], "is_leaf": true, "is_view": true, "stride": [832, 64, 1], "storage": 13, "base": 17, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317fbec0>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.284000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 16, "source": "L['from_blocked_mask']"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.293000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 14, "describer_id": 38, "size": 1277952}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.294000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 31, "ndim": 4, "dtype": "torch.bfloat16", "device": "device(type='cpu')", "size": [1, 12, 832, 64], "is_leaf": true, "stride": [638976, 64, 768, 1], "storage": 14, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311c73d0>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.294000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 31, "source": "L['key_layer']"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.295000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 15, "describer_id": 38, "size": 1277952}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.295000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 32, "ndim": 4, "dtype": "torch.bfloat16", "device": "device(type='cpu')", "size": [1, 12, 832, 64], "is_leaf": true, "stride": [638976, 64, 768, 1], "storage": 15, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311c7470>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.295000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 32, "source": "L['value_layer']"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.316000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 50, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 13, "base": 17, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.317000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 50, "source": "L['to_mask']"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.378000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 16, "describer_id": 38, "size": 442368}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.378000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 124, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 16, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.378000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 124, "source": "L['band_mask']"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.439000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 182, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 13, "base": 17, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.439000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 182, "source": "L['from_mask']"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:04.456000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {"l_stack0_0_": [11, 3], "l_stack0_1_": [11, 3], "l_stack0_2_": [11, 3], "l_stack0_3_": [11, 3], "l_stack0_4_": [11, 3], "l_stack0_5_": [11, 3], "l_stack0_6_": [11, 3], "l_stack0_7_": [11, 3], "l_stack0_8_": [11, 3], "l_stack0_9_": [11, 3], "l_stack0_10_": [11, 3], "l_stack0_11_": [11, 3], "l_query_layer_": [1, 12, 832, 64], "l_from_blocked_mask_": [1, 13, 64], "l_key_layer_": [1, 12, 832, 64], "l_value_layer_": [1, 12, 832, 64], "l_to_mask_": [1, 1, 1, 832], "l_band_mask_": [1, 1, 9, 64, 192], "l_from_mask_": [1, 1, 832, 1], "rand_attn": [12, 11, 3], "rand_attn_1": [1, 12, 11, 3], "unsqueeze_": [1, 12, 11, 3], "rand_attn_2": [1, 12, 11, 3], "p1": [13, 64], "i1": [12, 11, 3], "flatten": [396], "getitem_2": [396, 64], "rand_mask": [1, 396, 64], "rand_mask_1": [1, 12, 11, 192], "getitem_3": [1, 11, 64], "rand_mask_2": [1, 12, 11, 64, 192], "blocked_query_matrix": [1, 12, 13, 64, 64], "blocked_key_matrix": [1, 12, 13, 64, 64], "blocked_value_matrix": [1, 12, 13, 64, 64], "shift": [396], "div": [396], "indices_shift": [396], "view_4": [396], "flattened_indices": [396], "flattened_params": [156, 64, 64], "out_flattened": [396, 64, 64], "out": [1, 12, 33, 64, 64], "gathered_key": [1, 12, 11, 192, 64], "shift_1": [396], "div_1": [396], "indices_shift_1": [396], "view_6": [396], "flattened_indices_1": [396], "flattened_params_1": [156, 64, 64], "out_flattened_1": [396, 64, 64], "out_1": [1, 12, 33, 64, 64], "gathered_value": [1, 12, 11, 192, 64], "getitem_4": [1, 12, 64, 64], "reshape_4": [12, 64, 64], "reshape_5": [12, 832, 64], "transpose": [12, 64, 832], "bmm": [12, 64, 832], "first_product": [1, 12, 64, 832], "first_product_1": [1, 12, 64, 832], "sub": [1, 1, 1, 832], "mul_3": [1, 1, 1, 832], "first_product_2": [1, 12, 64, 832], "first_attn_weights": [1, 12, 64, 832], "reshape_6": [12, 64, 832], "reshape_7": [12, 832, 64], "bmm_1": [12, 64, 64], 
"first_context_layer": [1, 12, 1, 64, 64], "unsqueeze__1": [1, 12, 1, 64, 64], "getitem_5": [1, 12, 64, 64], "getitem_6": [1, 12, 64, 64], "getitem_7": [1, 12, 64, 64], "getitem_8": [1, 12, 64, 64], "getitem_9": [1, 12, 192, 64], "second_key_mat": [1, 12, 448, 64], "getitem_10": [1, 12, 64, 64], "getitem_11": [1, 12, 64, 64], "getitem_12": [1, 12, 64, 64], "getitem_13": [1, 12, 64, 64], "getitem_14": [1, 12, 192, 64], "second_value_mat": [1, 12, 448, 64], "getitem_15": [1, 12, 64, 64], "reshape_8": [12, 64, 64], "reshape_9": [12, 448, 64], "transpose_1": [12, 64, 448], "bmm_2": [12, 64, 448], "second_product": [1, 12, 64, 448], "getitem_16": [1, 1, 1, 192], "getitem_17": [1, 1, 1, 64], "new_ones": [1, 1, 1, 192], "second_seq_pad": [1, 1, 1, 448], "new_ones_1": [1, 12, 64, 256], "getitem_18": [1, 12, 64, 192], "second_rand_pad": [1, 12, 64, 448], "second_product_1": [1, 12, 64, 448], "minimum": [1, 12, 64, 448], "sub_1": [1, 12, 64, 448], "mul_5": [1, 12, 64, 448], "second_product_2": [1, 12, 64, 448], "second_attn_weights": [1, 12, 64, 448], "reshape_10": [12, 64, 448], "reshape_11": [12, 448, 64], "bmm_3": [12, 64, 64], "second_context_layer": [1, 12, 1, 64, 64], "unsqueeze__2": [1, 12, 1, 64, 64], "getitem_19": [1, 12, 9, 64, 64], "getitem_20": [1, 12, 9, 64, 64], "getitem_21": [1, 12, 9, 64, 64], "exp_blocked_key_matrix": [1, 12, 9, 192, 64], "getitem_22": [1, 12, 9, 64, 64], "getitem_23": [1, 12, 9, 64, 64], "getitem_24": [1, 12, 9, 64, 64], "exp_blocked_value_matrix": [1, 12, 9, 192, 64], "middle_query_matrix": [1, 12, 9, 64, 64], "reshape_12": [108, 64, 64], "reshape_13": [108, 192, 64], "transpose_2": [108, 64, 192], "bmm_4": [108, 64, 192], "inner_band_product": [1, 12, 9, 64, 192], "inner_band_product_1": [1, 12, 9, 64, 192], "getitem_26": [1, 12, 9, 192, 64], "reshape_14": [108, 64, 64], "reshape_15": [108, 192, 64], "transpose_3": [108, 64, 192], "bmm_5": [108, 64, 192], "rand_band_product": [1, 12, 9, 64, 192], "rand_band_product_1": [1, 12, 9, 64, 
192], "getitem_27": [1, 12, 64, 64], "first_band_product": [1, 12, 9, 64, 64], "first_band_product_1": [1, 12, 9, 64, 64], "getitem_28": [1, 12, 64, 64], "last_band_product": [1, 12, 9, 64, 64], "last_band_product_1": [1, 12, 9, 64, 64], "sub_2": [1, 1, 9, 64, 192], "mul_10": [1, 1, 9, 64, 192], "inner_band_product_2": [1, 12, 9, 64, 192], "getitem_29": [1, 1, 1, 64], "unsqueeze": [1, 1, 1, 1, 64], "sub_3": [1, 1, 1, 1, 64], "mul_11": [1, 1, 1, 1, 64], "first_band_product_2": [1, 12, 9, 64, 64], "getitem_30": [1, 1, 1, 64], "unsqueeze_1": [1, 1, 1, 1, 64], "sub_4": [1, 1, 1, 1, 64], "mul_12": [1, 1, 1, 1, 64], "last_band_product_2": [1, 12, 9, 64, 64], "getitem_31": [1, 12, 9, 64, 192], "sub_5": [1, 12, 9, 64, 192], "mul_13": [1, 12, 9, 64, 192], "rand_band_product_2": [1, 12, 9, 64, 192], "band_product": [1, 12, 9, 64, 512], "attn_weights": [1, 12, 9, 64, 512], "getitem_32": [1, 12, 9, 64, 192], "reshape_16": [108, 64, 192], "reshape_17": [108, 192, 64], "bmm_6": [108, 64, 64], "context_layer": [1, 12, 9, 64, 64], "getitem_33": [1, 12, 9, 64, 192], "getitem_34": [1, 12, 9, 192, 64], "reshape_18": [108, 64, 192], "reshape_19": [108, 192, 64], "bmm_7": [108, 64, 64], "view_15": [1, 12, 9, 64, 64], "context_layer_1": [1, 12, 9, 64, 64], "getitem_35": [1, 12, 9, 64, 64], "getitem_36": [1, 12, 64, 64], "einsum_3": [1, 12, 9, 64, 64], "context_layer_2": [1, 12, 9, 64, 64], "getitem_37": [1, 12, 9, 64, 64], "getitem_38": [1, 12, 64, 64], "einsum_4": [1, 12, 9, 64, 64], "context_layer_3": [1, 12, 9, 64, 64], "getitem_39": [1, 12, 64, 64], "getitem_40": [1, 12, 64, 64], "getitem_41": [1, 12, 64, 64], "getitem_42": [1, 12, 64, 64], "getitem_43": [1, 12, 192, 64], "second_last_key_mat": [1, 12, 448, 64], "getitem_44": [1, 12, 64, 64], "getitem_45": [1, 12, 64, 64], "getitem_46": [1, 12, 64, 64], "getitem_47": [1, 12, 64, 64], "getitem_48": [1, 12, 192, 64], "second_last_value_mat": [1, 12, 448, 64], "getitem_49": [1, 12, 64, 64], "reshape_20": [12, 64, 64], "reshape_21": 
[12, 448, 64], "transpose_4": [12, 64, 448], "bmm_8": [12, 64, 448], "second_last_product": [1, 12, 64, 448], "getitem_50": [1, 1, 1, 64], "getitem_51": [1, 1, 1, 192], "new_ones_2": [1, 1, 1, 192], "second_last_seq_pad": [1, 1, 1, 448], "new_ones_3": [1, 12, 64, 256], "getitem_52": [1, 12, 64, 192], "second_last_rand_pad": [1, 12, 64, 448], "second_last_product_1": [1, 12, 64, 448], "minimum_1": [1, 12, 64, 448], "sub_6": [1, 12, 64, 448], "mul_15": [1, 12, 64, 448], "second_last_product_2": [1, 12, 64, 448], "second_last_attn_weights": [1, 12, 64, 448], "reshape_22": [12, 64, 448], "reshape_23": [12, 448, 64], "bmm_9": [12, 64, 64], "second_last_context_layer": [1, 12, 1, 64, 64], "unsqueeze__3": [1, 12, 1, 64, 64], "getitem_53": [1, 12, 64, 64], "reshape_24": [12, 64, 64], "reshape_25": [12, 832, 64], "transpose_5": [12, 64, 832], "bmm_10": [12, 64, 832], "last_product": [1, 12, 64, 832], "last_product_1": [1, 12, 64, 832], "sub_7": [1, 1, 1, 832], "mul_17": [1, 1, 1, 832], "last_product_2": [1, 12, 64, 832], "last_attn_weights": [1, 12, 64, 832], "reshape_26": [12, 64, 832], "reshape_27": [12, 832, 64], "bmm_11": [12, 64, 64], "last_context_layer": [1, 12, 1, 64, 64], "unsqueeze__4": [1, 12, 1, 64, 64], "context_layer_4": [1, 12, 13, 64, 64], "view_20": [1, 12, 832, 64], "context_layer_5": [1, 12, 832, 64], "context_layer_6": [1, 832, 12, 64]}}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0, "has_payload": "8c75f78cbe780240c3647a51d604b94a"} | |
class GraphModule(torch.nn.Module): | |
def forward(self, L_stack0_0_: "i32[11, 3][3, 1]cpu", L_stack0_1_: "i32[11, 3][3, 1]cpu", L_stack0_2_: "i32[11, 3][3, 1]cpu", L_stack0_3_: "i32[11, 3][3, 1]cpu", L_stack0_4_: "i32[11, 3][3, 1]cpu", L_stack0_5_: "i32[11, 3][3, 1]cpu", L_stack0_6_: "i32[11, 3][3, 1]cpu", L_stack0_7_: "i32[11, 3][3, 1]cpu", L_stack0_8_: "i32[11, 3][3, 1]cpu", L_stack0_9_: "i32[11, 3][3, 1]cpu", L_stack0_10_: "i32[11, 3][3, 1]cpu", L_stack0_11_: "i32[11, 3][3, 1]cpu", L_query_layer_: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", L_from_blocked_mask_: "f32[1, 13, 64][832, 64, 1]cpu", L_key_layer_: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", L_value_layer_: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", L_to_mask_: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu", L_band_mask_: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu", L_from_mask_: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu"): | |
l_stack0_0_ = L_stack0_0_ | |
l_stack0_1_ = L_stack0_1_ | |
l_stack0_2_ = L_stack0_2_ | |
l_stack0_3_ = L_stack0_3_ | |
l_stack0_4_ = L_stack0_4_ | |
l_stack0_5_ = L_stack0_5_ | |
l_stack0_6_ = L_stack0_6_ | |
l_stack0_7_ = L_stack0_7_ | |
l_stack0_8_ = L_stack0_8_ | |
l_stack0_9_ = L_stack0_9_ | |
l_stack0_10_ = L_stack0_10_ | |
l_stack0_11_ = L_stack0_11_ | |
l_query_layer_ = L_query_layer_ | |
l_from_blocked_mask_ = L_from_blocked_mask_ | |
l_key_layer_ = L_key_layer_ | |
l_value_layer_ = L_value_layer_ | |
l_to_mask_ = L_to_mask_ | |
l_band_mask_ = L_band_mask_ | |
l_from_mask_ = L_from_mask_ | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:593 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = np.stack(rand_attn, axis=0) | |
rand_attn: "i32[12, 11, 3][33, 3, 1]cpu" = torch__dynamo_utils_wrapped_stack([l_stack0_0_, l_stack0_1_, l_stack0_2_, l_stack0_3_, l_stack0_4_, l_stack0_5_, l_stack0_6_, l_stack0_7_, l_stack0_8_, l_stack0_9_, l_stack0_10_, l_stack0_11_], axis = 0); l_stack0_0_ = l_stack0_1_ = l_stack0_2_ = l_stack0_3_ = l_stack0_4_ = l_stack0_5_ = l_stack0_6_ = l_stack0_7_ = l_stack0_8_ = l_stack0_9_ = l_stack0_10_ = l_stack0_11_ = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:594 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = torch.tensor(rand_attn, device=query_layer.device, dtype=torch.long) | |
rand_attn_1: "i64[1, 12, 11, 3][396, 33, 3, 1]cpu" = torch.tensor(rand_attn, device = device(type='cpu'), dtype = torch.int64); rand_attn = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:595 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn.unsqueeze_(0) | |
unsqueeze_: "i64[1, 12, 11, 3][396, 33, 3, 1]cpu" = rand_attn_1.unsqueeze_(0) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:596 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = torch.cat([rand_attn for _ in range(batch_size)], dim=0) | |
rand_attn_2: "i64[1, 12, 11, 3][396, 33, 3, 1]cpu" = torch.cat([rand_attn_1], dim = 0); rand_attn_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in _create_rand_mask_from_inputs, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)]) | |
p1: "f32[13, 64][64, 1]cpu" = l_from_blocked_mask_[0] | |
i1: "i64[12, 11, 3][33, 3, 1]cpu" = rand_attn_2[0] | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in <listcomp>, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)]) | |
flatten: "i64[396][1]cpu" = i1.flatten(); i1 = None | |
getitem_2: "f32[396, 64][64, 1]cpu" = p1[flatten]; p1 = flatten = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in _create_rand_mask_from_inputs, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)]) | |
rand_mask: "f32[1, 396, 64][25344, 64, 1]cpu" = torch.stack([getitem_2]); getitem_2 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1016 in _create_rand_mask_from_inputs, code: rand_mask = rand_mask.view(batch_size, num_attention_heads, num_windows, num_rand_blocks * from_block_size) | |
rand_mask_1: "f32[1, 12, 11, 192][25344, 2112, 192, 1]cpu" = rand_mask.view(1, 12, 11, 192); rand_mask = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1017 in _create_rand_mask_from_inputs, code: rand_mask = torch.einsum("blq,bhlk->bhlqk", from_blocked_mask[:, 1:-1], rand_mask) | |
getitem_3: "f32[1, 11, 64][832, 64, 1]cpu" = l_from_blocked_mask_[(slice(None, None, None), slice(1, -1, None))]; l_from_blocked_mask_ = None | |
rand_mask_2: "f32[1, 12, 11, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.functional.einsum('blq,bhlk->bhlqk', getitem_3, rand_mask_1); getitem_3 = rand_mask_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:602 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_query_matrix = query_layer.view(bsz, n_heads, from_seq_len // from_block_size, from_block_size, -1) | |
blocked_query_matrix: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = l_query_layer_.view(1, 12, 13, 64, -1); l_query_layer_ = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:603 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix = key_layer.view(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1) | |
blocked_key_matrix: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = l_key_layer_.view(1, 12, 13, 64, -1) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:604 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix = value_layer.view(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1) | |
blocked_value_matrix: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = l_value_layer_.view(1, 12, 13, 64, -1) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:972 in torch_gather_b2, code: shift = torch.arange(indices.shape[0] * indices.shape[1] * num_indices_to_gather, device=indices.device) | |
shift: "i64[396][1]cpu" = torch.arange(396, device = device(type='cpu')) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:973 in torch_gather_b2, code: indices_shift = torch.div(shift, num_indices_to_gather, rounding_mode="floor") * num_indices_to_pick_from | |
div: "i64[396][1]cpu" = torch.div(shift, 33, rounding_mode = 'floor'); shift = None | |
indices_shift: "i64[396][1]cpu" = div * 13; div = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:975 in torch_gather_b2, code: flattened_indices = indices.view(-1) + indices_shift | |
view_4: "i64[396][1]cpu" = rand_attn_2.view(-1) | |
flattened_indices: "i64[396][1]cpu" = view_4 + indices_shift; view_4 = indices_shift = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:976 in torch_gather_b2, code: flattened_params = params.reshape(-1, params.shape[-2], params.shape[-1]) | |
flattened_params: "bf16[156, 64, 64][4096, 64, 1]cpu" = blocked_key_matrix.reshape(-1, 64, 64) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:978 in torch_gather_b2, code: out_flattened = flattened_params.index_select(0, flattened_indices) | |
out_flattened: "bf16[396, 64, 64][4096, 64, 1]cpu" = flattened_params.index_select(0, flattened_indices); flattened_params = flattened_indices = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:980 in torch_gather_b2, code: out = out_flattened.reshape(params.shape[:2] + (num_indices_to_gather,) + params.shape[3:]) | |
out: "bf16[1, 12, 33, 64, 64][1622016, 135168, 4096, 64, 1]cpu" = out_flattened.reshape((1, 12, 33, 64, 64)); out_flattened = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:608 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key = gathered_key.view( | |
gathered_key: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = out.view(1, 12, 11, 192, -1); out = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:972 in torch_gather_b2, code: shift = torch.arange(indices.shape[0] * indices.shape[1] * num_indices_to_gather, device=indices.device) | |
shift_1: "i64[396][1]cpu" = torch.arange(396, device = device(type='cpu')) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:973 in torch_gather_b2, code: indices_shift = torch.div(shift, num_indices_to_gather, rounding_mode="floor") * num_indices_to_pick_from | |
div_1: "i64[396][1]cpu" = torch.div(shift_1, 33, rounding_mode = 'floor'); shift_1 = None | |
indices_shift_1: "i64[396][1]cpu" = div_1 * 13; div_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:975 in torch_gather_b2, code: flattened_indices = indices.view(-1) + indices_shift | |
view_6: "i64[396][1]cpu" = rand_attn_2.view(-1) | |
flattened_indices_1: "i64[396][1]cpu" = view_6 + indices_shift_1; view_6 = indices_shift_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:976 in torch_gather_b2, code: flattened_params = params.reshape(-1, params.shape[-2], params.shape[-1]) | |
flattened_params_1: "bf16[156, 64, 64][4096, 64, 1]cpu" = blocked_value_matrix.reshape(-1, 64, 64) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:978 in torch_gather_b2, code: out_flattened = flattened_params.index_select(0, flattened_indices) | |
out_flattened_1: "bf16[396, 64, 64][4096, 64, 1]cpu" = flattened_params_1.index_select(0, flattened_indices_1); flattened_params_1 = flattened_indices_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:980 in torch_gather_b2, code: out = out_flattened.reshape(params.shape[:2] + (num_indices_to_gather,) + params.shape[3:]) | |
out_1: "bf16[1, 12, 33, 64, 64][1622016, 135168, 4096, 64, 1]cpu" = out_flattened_1.reshape((1, 12, 33, 64, 64)); out_flattened_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:612 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value = gathered_value.view( | |
gathered_value: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = out_1.view(1, 12, 11, 192, -1); out_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:621 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, 0], key_layer, ndim=4) | |
getitem_4: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_query_matrix[(slice(None, None, None), slice(None, None, None), 0)] | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2) | |
reshape_4: "bf16[12, 64, 64][64, 768, 1]cpu" = getitem_4.reshape((-1, 64, 64)); getitem_4 = None | |
reshape_5: "bf16[12, 832, 64][64, 768, 1]cpu" = l_key_layer_.reshape((-1, 832, 64)) | |
transpose: "bf16[12, 64, 832][64, 1, 768]cpu" = reshape_5.transpose(1, 2); reshape_5 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm( | |
bmm: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.bmm(reshape_4, transpose); reshape_4 = transpose = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2])) | |
first_product: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = bmm.view((1, 12, 64, 832)); bmm = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:623 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product = first_product * rsqrt_d | |
first_product_1: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = first_product * 0.125; first_product = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:624 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product += (1.0 - to_mask) * attn_mask_penalty | |
sub: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = 1.0 - l_to_mask_ | |
mul_3: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = sub * -10000.0; sub = None | |
first_product_1 += mul_3; first_product_2: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = first_product_1; first_product_1 = mul_3 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:625 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_attn_weights = nn.functional.softmax( | |
first_attn_weights: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.nn.functional.softmax(first_product_2, dim = -1); first_product_2 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view( | |
reshape_6: "bf16[12, 64, 832][53248, 832, 1]cpu" = first_attn_weights.reshape((-1, 64, 832)); first_attn_weights = None | |
reshape_7: "bf16[12, 832, 64][64, 768, 1]cpu" = l_value_layer_.reshape((-1, 832, 64)) | |
bmm_1: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.bmm(reshape_6, reshape_7); reshape_6 = reshape_7 = None | |
first_context_layer: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = bmm_1.view((1, 12, 64, 64)); bmm_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:631 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_context_layer.unsqueeze_(2) | |
unsqueeze__1: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = first_context_layer.unsqueeze_(2) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:641 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 0], | |
getitem_5: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), 0)] | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:642 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 1], | |
getitem_6: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), 1)] | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:643 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 2], | |
getitem_7: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), 2)] | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:644 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -1], | |
getitem_8: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), -1)] | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:645 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key[:, :, 0], | |
getitem_9: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = gathered_key[(slice(None, None, None), slice(None, None, None), 0)] | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:639 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_key_mat = torch.cat( | |
second_key_mat: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" = torch.cat([getitem_5, getitem_6, getitem_7, getitem_8, getitem_9], dim = 2); getitem_5 = getitem_6 = getitem_7 = getitem_8 = getitem_9 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:651 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 0], | |
getitem_10: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), 0)] | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:652 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 1], | |
getitem_11: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), 1)] | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:653 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 2], | |
getitem_12: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), 2)] | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:654 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -1], | |
getitem_13: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), -1)] | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:655 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value[:, :, 0], | |
getitem_14: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = gathered_value[(slice(None, None, None), slice(None, None, None), 0)] | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:649 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_value_mat = torch.cat( | |
second_value_mat: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" = torch.cat([getitem_10, getitem_11, getitem_12, getitem_13, getitem_14], dim = 2); getitem_10 = getitem_11 = getitem_12 = getitem_13 = getitem_14 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:661 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, 1], second_key_mat, ndim=4) | |
getitem_15: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_query_matrix[(slice(None, None, None), slice(None, None, None), 1)] | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2) | |
reshape_8: "bf16[12, 64, 64][64, 768, 1]cpu" = getitem_15.reshape((-1, 64, 64)); getitem_15 = None | |
reshape_9: "bf16[12, 448, 64][28672, 64, 1]cpu" = second_key_mat.reshape((-1, 448, 64)); second_key_mat = None | |
transpose_1: "bf16[12, 64, 448][28672, 1, 64]cpu" = reshape_9.transpose(1, 2); reshape_9 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm( | |
bmm_2: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.bmm(reshape_8, transpose_1); reshape_8 = transpose_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2])) | |
second_product: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = bmm_2.view((1, 12, 64, 448)); bmm_2 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:664 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, : 3 * to_block_size], | |
getitem_16: "f32[1, 1, 1, 192][832, 832, 832, 1]cpu" = l_to_mask_[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(None, 192, None))] | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:665 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, -to_block_size:], | |
getitem_17: "f32[1, 1, 1, 64][832, 832, 832, 1]cpu" = l_to_mask_[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(-64, None, None))] | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:666 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask.new_ones([bsz, 1, 1, n_rand_blocks * to_block_size]), | |
new_ones: "f32[1, 1, 1, 192][192, 192, 192, 1]cpu" = l_to_mask_.new_ones([1, 1, 1, 192]) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:662 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_seq_pad = torch.cat( | |
second_seq_pad: "f32[1, 1, 1, 448][448, 448, 448, 1]cpu" = torch.cat([getitem_16, getitem_17, new_ones], dim = 3); getitem_16 = getitem_17 = new_ones = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:672 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask.new_ones([bsz, n_heads, from_block_size, 4 * to_block_size]), | |
new_ones_1: "f32[1, 12, 64, 256][196608, 16384, 256, 1]cpu" = rand_mask_2.new_ones([1, 12, 64, 256]) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:673 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask[:, :, 0], | |
getitem_18: "f32[1, 12, 64, 192][1622016, 135168, 192, 1]cpu" = rand_mask_2[(slice(None, None, None), slice(None, None, None), 0)] | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:670 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_rand_pad = torch.cat( | |
second_rand_pad: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.cat([new_ones_1, getitem_18], dim = 3); new_ones_1 = getitem_18 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:677 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product = second_product * rsqrt_d | |
second_product_1: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = second_product * 0.125; second_product = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:678 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product += (1.0 - torch.minimum(second_seq_pad, second_rand_pad)) * attn_mask_penalty | |
minimum: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.minimum(second_seq_pad, second_rand_pad); second_seq_pad = second_rand_pad = None | |
sub_1: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = 1.0 - minimum; minimum = None | |
mul_5: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = sub_1 * -10000.0; sub_1 = None | |
second_product_1 += mul_5; second_product_2: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = second_product_1; second_product_1 = mul_5 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:679 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_attn_weights = nn.functional.softmax( | |
second_attn_weights: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.nn.functional.softmax(second_product_2, dim = -1); second_product_2 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view( | |
reshape_10: "bf16[12, 64, 448][28672, 448, 1]cpu" = second_attn_weights.reshape((-1, 64, 448)); second_attn_weights = None | |
reshape_11: "bf16[12, 448, 64][28672, 64, 1]cpu" = second_value_mat.reshape((-1, 448, 64)); second_value_mat = None | |
bmm_3: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.bmm(reshape_10, reshape_11); reshape_10 = reshape_11 = None | |
second_context_layer: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = bmm_3.view((1, 12, 64, 64)); bmm_3 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:686 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_context_layer.unsqueeze_(2) | |
unsqueeze__2: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = second_context_layer.unsqueeze_(2) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:696 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: [blocked_key_matrix[:, :, 1:-3], blocked_key_matrix[:, :, 2:-2], blocked_key_matrix[:, :, 3:-1]], dim=3 | |
getitem_19: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), slice(1, -3, None))] | |
getitem_20: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), slice(2, -2, None))] | |
getitem_21: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), slice(3, -1, None))] | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:695 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: exp_blocked_key_matrix = torch.cat( | |
exp_blocked_key_matrix: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" = torch.cat([getitem_19, getitem_20, getitem_21], dim = 3); getitem_19 = getitem_20 = getitem_21 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:699 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: [blocked_value_matrix[:, :, 1:-3], blocked_value_matrix[:, :, 2:-2], blocked_value_matrix[:, :, 3:-1]], | |
getitem_22: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), slice(1, -3, None))] | |
getitem_23: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), slice(2, -2, None))] | |
getitem_24: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), slice(3, -1, None))] | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:698 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: exp_blocked_value_matrix = torch.cat( | |
exp_blocked_value_matrix: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" = torch.cat([getitem_22, getitem_23, getitem_24], dim = 3); getitem_22 = getitem_23 = getitem_24 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:702 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: middle_query_matrix = blocked_query_matrix[:, :, 2:-2] | |
middle_query_matrix: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = blocked_query_matrix[(slice(None, None, None), slice(None, None, None), slice(2, -2, None))] | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2) | |
reshape_12: "bf16[108, 64, 64][4096, 64, 1]cpu" = middle_query_matrix.reshape((-1, 64, 64)) | |
reshape_13: "bf16[108, 192, 64][12288, 64, 1]cpu" = exp_blocked_key_matrix.reshape((-1, 192, 64)); exp_blocked_key_matrix = None | |
transpose_2: "bf16[108, 64, 192][12288, 1, 64]cpu" = reshape_13.transpose(1, 2); reshape_13 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm( | |
bmm_4: "bf16[108, 64, 192][12288, 192, 1]cpu" = torch.bmm(reshape_12, transpose_2); reshape_12 = transpose_2 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2])) | |
inner_band_product: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = bmm_4.view((1, 12, 9, 64, 192)); bmm_4 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:708 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: inner_band_product = inner_band_product * rsqrt_d | |
inner_band_product_1: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = inner_band_product * 0.125; inner_band_product = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:712 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product = self.torch_bmm_nd_transpose(middle_query_matrix, gathered_key[:, :, 1:-1], ndim=5) | |
getitem_26: "bf16[1, 12, 9, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = gathered_key[(slice(None, None, None), slice(None, None, None), slice(1, -1, None))] | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2) | |
reshape_14: "bf16[108, 64, 64][4096, 64, 1]cpu" = middle_query_matrix.reshape((-1, 64, 64)) | |
reshape_15: "bf16[108, 192, 64][12288, 64, 1]cpu" = getitem_26.reshape((-1, 192, 64)); getitem_26 = None | |
transpose_3: "bf16[108, 64, 192][12288, 1, 64]cpu" = reshape_15.transpose(1, 2); reshape_15 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm( | |
bmm_5: "bf16[108, 64, 192][12288, 192, 1]cpu" = torch.bmm(reshape_14, transpose_3); reshape_14 = transpose_3 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2])) | |
rand_band_product: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = bmm_5.view((1, 12, 9, 64, 192)); bmm_5 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:714 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product = rand_band_product * rsqrt_d | |
rand_band_product_1: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = rand_band_product * 0.125; rand_band_product = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:718 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqd,bhkd->bhlqk", middle_query_matrix, blocked_key_matrix[:, :, 0] | |
getitem_27: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), 0)] | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:717 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product = torch.einsum( | |
first_band_product: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.functional.einsum('bhlqd,bhkd->bhlqk', middle_query_matrix, getitem_27); getitem_27 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:720 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product = first_band_product * rsqrt_d | |
first_band_product_1: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = first_band_product * 0.125; first_band_product = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:724 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqd,bhkd->bhlqk", middle_query_matrix, blocked_key_matrix[:, :, -1] | |
getitem_28: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), -1)] | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:723 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product = torch.einsum( | |
last_band_product: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.functional.einsum('bhlqd,bhkd->bhlqk', middle_query_matrix, getitem_28); middle_query_matrix = getitem_28 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:726 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product = last_band_product * rsqrt_d | |
last_band_product_1: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = last_band_product * 0.125; last_band_product = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:729 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: inner_band_product += (1.0 - band_mask) * attn_mask_penalty | |
sub_2: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = 1.0 - l_band_mask_; l_band_mask_ = None | |
mul_10: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = sub_2 * -10000.0; sub_2 = None | |
inner_band_product_1 += mul_10; inner_band_product_2: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = inner_band_product_1; inner_band_product_1 = mul_10 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:730 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product += (1.0 - to_mask[:, :, :, :to_block_size].unsqueeze(3)) * attn_mask_penalty | |
getitem_29: "f32[1, 1, 1, 64][832, 832, 832, 1]cpu" = l_to_mask_[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(None, 64, None))] | |
unsqueeze: "f32[1, 1, 1, 1, 64][832, 832, 832, 64, 1]cpu" = getitem_29.unsqueeze(3); getitem_29 = None | |
sub_3: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = 1.0 - unsqueeze; unsqueeze = None | |
mul_11: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = sub_3 * -10000.0; sub_3 = None | |
first_band_product_1 += mul_11; first_band_product_2: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = first_band_product_1; first_band_product_1 = mul_11 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:731 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product += (1.0 - to_mask[:, :, :, -to_block_size:].unsqueeze(3)) * attn_mask_penalty | |
getitem_30: "f32[1, 1, 1, 64][832, 832, 832, 1]cpu" = l_to_mask_[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(-64, None, None))] | |
unsqueeze_1: "f32[1, 1, 1, 1, 64][832, 832, 832, 64, 1]cpu" = getitem_30.unsqueeze(3); getitem_30 = None | |
sub_4: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = 1.0 - unsqueeze_1; unsqueeze_1 = None | |
mul_12: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = sub_4 * -10000.0; sub_4 = None | |
last_band_product_1 += mul_12; last_band_product_2: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = last_band_product_1; last_band_product_1 = mul_12 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:732 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product += (1.0 - rand_mask[:, :, 1:-1]) * attn_mask_penalty | |
getitem_31: "f32[1, 12, 9, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = rand_mask_2[(slice(None, None, None), slice(None, None, None), slice(1, -1, None))] | |
sub_5: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = 1.0 - getitem_31; getitem_31 = None | |
mul_13: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = sub_5 * -10000.0; sub_5 = None | |
rand_band_product_1 += mul_13; rand_band_product_2: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = rand_band_product_1; rand_band_product_1 = mul_13 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:735 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: band_product = torch.cat( | |
band_product: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.cat([first_band_product_2, inner_band_product_2, rand_band_product_2, last_band_product_2], dim = -1); first_band_product_2 = inner_band_product_2 = rand_band_product_2 = last_band_product_2 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:740 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights = nn.functional.softmax( | |
attn_weights: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.nn.functional.softmax(band_product, dim = -1); band_product = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:747 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights[:, :, :, :, to_block_size : 4 * to_block_size], exp_blocked_value_matrix, ndim=5 | |
getitem_32: "bf16[1, 12, 9, 64, 192][3538944, 294912, 32768, 512, 1]cpu" = attn_weights[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(64, 256, None))] | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view( | |
reshape_16: "bf16[108, 64, 192][32768, 512, 1]cpu" = getitem_32.reshape((-1, 64, 192)); getitem_32 = None | |
reshape_17: "bf16[108, 192, 64][12288, 64, 1]cpu" = exp_blocked_value_matrix.reshape((-1, 192, 64)); exp_blocked_value_matrix = None | |
bmm_6: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.bmm(reshape_16, reshape_17); reshape_16 = reshape_17 = None | |
context_layer: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = bmm_6.view((1, 12, 9, 64, 64)); bmm_6 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:754 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights[:, :, :, :, 4 * to_block_size : -to_block_size], gathered_value[:, :, 1:-1], ndim=5 | |
getitem_33: "bf16[1, 12, 9, 64, 192][3538944, 294912, 32768, 512, 1]cpu" = attn_weights[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(256, -64, None))] | |
getitem_34: "bf16[1, 12, 9, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = gathered_value[(slice(None, None, None), slice(None, None, None), slice(1, -1, None))] | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view( | |
reshape_18: "bf16[108, 64, 192][32768, 512, 1]cpu" = getitem_33.reshape((-1, 64, 192)); getitem_33 = None | |
reshape_19: "bf16[108, 192, 64][12288, 64, 1]cpu" = getitem_34.reshape((-1, 192, 64)); getitem_34 = None | |
bmm_7: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.bmm(reshape_18, reshape_19); reshape_18 = reshape_19 = None | |
view_15: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = bmm_7.view((1, 12, 9, 64, 64)); bmm_7 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:753 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += self.torch_bmm_nd( | |
context_layer += view_15; context_layer_1: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = context_layer; context_layer = view_15 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:760 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, :to_block_size], blocked_value_matrix[:, :, 0] | |
getitem_35: "bf16[1, 12, 9, 64, 64][3538944, 294912, 32768, 512, 1]cpu" = attn_weights[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(None, 64, None))] | |
getitem_36: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), 0)] | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:759 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += torch.einsum( | |
einsum_3: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.functional.einsum('bhlqk,bhkd->bhlqd', getitem_35, getitem_36); getitem_35 = getitem_36 = None | |
context_layer_1 += einsum_3; context_layer_2: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = context_layer_1; context_layer_1 = einsum_3 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:763 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, -to_block_size:], blocked_value_matrix[:, :, -1] | |
getitem_37: "bf16[1, 12, 9, 64, 64][3538944, 294912, 32768, 512, 1]cpu" = attn_weights[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(-64, None, None))]; attn_weights = None | |
getitem_38: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), -1)] | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:762 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += torch.einsum( | |
einsum_4: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.functional.einsum('bhlqk,bhkd->bhlqd', getitem_37, getitem_38); getitem_37 = getitem_38 = None | |
context_layer_2 += einsum_4; context_layer_3: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = context_layer_2; context_layer_2 = einsum_4 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:775 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 0], | |
getitem_39: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), 0)] | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:776 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -3], | |
getitem_40: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), -3)] | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:777 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -2], | |
getitem_41: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), -2)] | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:778 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -1], | |
getitem_42: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), -1)]; blocked_key_matrix = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:779 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key[:, :, -1], | |
getitem_43: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = gathered_key[(slice(None, None, None), slice(None, None, None), -1)]; gathered_key = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:773 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_key_mat = torch.cat( | |
second_last_key_mat: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" = torch.cat([getitem_39, getitem_40, getitem_41, getitem_42, getitem_43], dim = 2); getitem_39 = getitem_40 = getitem_41 = getitem_42 = getitem_43 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:785 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 0], | |
getitem_44: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), 0)] | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:786 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -3], | |
getitem_45: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), -3)] | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:787 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -2], | |
getitem_46: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), -2)] | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:788 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -1], | |
getitem_47: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), -1)]; blocked_value_matrix = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:789 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value[:, :, -1], | |
getitem_48: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = gathered_value[(slice(None, None, None), slice(None, None, None), -1)]; gathered_value = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:783 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_value_mat = torch.cat( | |
second_last_value_mat: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" = torch.cat([getitem_44, getitem_45, getitem_46, getitem_47, getitem_48], dim = 2); getitem_44 = getitem_45 = getitem_46 = getitem_47 = getitem_48 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:795 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -2], second_last_key_mat, ndim=4) | |
getitem_49: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_query_matrix[(slice(None, None, None), slice(None, None, None), -2)] | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2) | |
reshape_20: "bf16[12, 64, 64][64, 768, 1]cpu" = getitem_49.reshape((-1, 64, 64)); getitem_49 = None | |
reshape_21: "bf16[12, 448, 64][28672, 64, 1]cpu" = second_last_key_mat.reshape((-1, 448, 64)); second_last_key_mat = None | |
transpose_4: "bf16[12, 64, 448][28672, 1, 64]cpu" = reshape_21.transpose(1, 2); reshape_21 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm( | |
bmm_8: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.bmm(reshape_20, transpose_4); reshape_20 = transpose_4 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2])) | |
second_last_product: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = bmm_8.view((1, 12, 64, 448)); bmm_8 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:798 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, :to_block_size], | |
getitem_50: "f32[1, 1, 1, 64][832, 832, 832, 1]cpu" = l_to_mask_[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(None, 64, None))] | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:799 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, -3 * to_block_size :], | |
getitem_51: "f32[1, 1, 1, 192][832, 832, 832, 1]cpu" = l_to_mask_[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(-192, None, None))] | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:800 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask.new_ones([bsz, 1, 1, n_rand_blocks * to_block_size]), | |
new_ones_2: "f32[1, 1, 1, 192][192, 192, 192, 1]cpu" = l_to_mask_.new_ones([1, 1, 1, 192]) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:796 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_seq_pad = torch.cat( | |
second_last_seq_pad: "f32[1, 1, 1, 448][448, 448, 448, 1]cpu" = torch.cat([getitem_50, getitem_51, new_ones_2], dim = 3); getitem_50 = getitem_51 = new_ones_2 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:806 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask.new_ones([bsz, n_heads, from_block_size, 4 * to_block_size]), | |
new_ones_3: "f32[1, 12, 64, 256][196608, 16384, 256, 1]cpu" = rand_mask_2.new_ones([1, 12, 64, 256]) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:807 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask[:, :, -1], | |
getitem_52: "f32[1, 12, 64, 192][1622016, 135168, 192, 1]cpu" = rand_mask_2[(slice(None, None, None), slice(None, None, None), -1)]; rand_mask_2 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:804 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_rand_pad = torch.cat( | |
second_last_rand_pad: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.cat([new_ones_3, getitem_52], dim = 3); new_ones_3 = getitem_52 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:811 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product = second_last_product * rsqrt_d | |
second_last_product_1: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = second_last_product * 0.125; second_last_product = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:812 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product += (1.0 - torch.minimum(second_last_seq_pad, second_last_rand_pad)) * attn_mask_penalty | |
minimum_1: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.minimum(second_last_seq_pad, second_last_rand_pad); second_last_seq_pad = second_last_rand_pad = None | |
sub_6: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = 1.0 - minimum_1; minimum_1 = None | |
mul_15: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = sub_6 * -10000.0; sub_6 = None | |
second_last_product_1 += mul_15; second_last_product_2: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = second_last_product_1; second_last_product_1 = mul_15 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:813 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_attn_weights = nn.functional.softmax( | |
second_last_attn_weights: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.nn.functional.softmax(second_last_product_2, dim = -1); second_last_product_2 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view( | |
reshape_22: "bf16[12, 64, 448][28672, 448, 1]cpu" = second_last_attn_weights.reshape((-1, 64, 448)); second_last_attn_weights = None | |
reshape_23: "bf16[12, 448, 64][28672, 64, 1]cpu" = second_last_value_mat.reshape((-1, 448, 64)); second_last_value_mat = None | |
bmm_9: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.bmm(reshape_22, reshape_23); reshape_22 = reshape_23 = None | |
second_last_context_layer: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = bmm_9.view((1, 12, 64, 64)); bmm_9 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:819 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_context_layer.unsqueeze_(2) | |
unsqueeze__3: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = second_last_context_layer.unsqueeze_(2) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:826 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -1], key_layer, ndim=4) | |
getitem_53: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_query_matrix[(slice(None, None, None), slice(None, None, None), -1)]; blocked_query_matrix = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2) | |
reshape_24: "bf16[12, 64, 64][64, 768, 1]cpu" = getitem_53.reshape((-1, 64, 64)); getitem_53 = None | |
reshape_25: "bf16[12, 832, 64][64, 768, 1]cpu" = l_key_layer_.reshape((-1, 832, 64)); l_key_layer_ = None | |
transpose_5: "bf16[12, 64, 832][64, 1, 768]cpu" = reshape_25.transpose(1, 2); reshape_25 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm( | |
bmm_10: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.bmm(reshape_24, transpose_5); reshape_24 = transpose_5 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2])) | |
last_product: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = bmm_10.view((1, 12, 64, 832)); bmm_10 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:827 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product = last_product * rsqrt_d | |
last_product_1: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = last_product * 0.125; last_product = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:828 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product += (1.0 - to_mask) * attn_mask_penalty | |
sub_7: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = 1.0 - l_to_mask_; l_to_mask_ = None | |
mul_17: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = sub_7 * -10000.0; sub_7 = None | |
last_product_1 += mul_17; last_product_2: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = last_product_1; last_product_1 = mul_17 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:829 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_attn_weights = nn.functional.softmax(last_product, dim=-1) # [bsz, n_heads, from_block_size, n] | |
last_attn_weights: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.nn.functional.softmax(last_product_2, dim = -1); last_product_2 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view( | |
reshape_26: "bf16[12, 64, 832][53248, 832, 1]cpu" = last_attn_weights.reshape((-1, 64, 832)); last_attn_weights = None | |
reshape_27: "bf16[12, 832, 64][64, 768, 1]cpu" = l_value_layer_.reshape((-1, 832, 64)); l_value_layer_ = None | |
bmm_11: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.bmm(reshape_26, reshape_27); reshape_26 = reshape_27 = None | |
last_context_layer: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = bmm_11.view((1, 12, 64, 64)); bmm_11 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:833 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_context_layer.unsqueeze_(2) | |
unsqueeze__4: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = last_context_layer.unsqueeze_(2) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:836 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer = torch.cat( | |
context_layer_4: "bf16[1, 12, 13, 64, 64][638976, 53248, 4096, 64, 1]cpu" = torch.cat([first_context_layer, second_context_layer, context_layer_3, second_last_context_layer, last_context_layer], dim = 2); first_context_layer = second_context_layer = context_layer_3 = second_last_context_layer = last_context_layer = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:840 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer = context_layer.view((bsz, n_heads, from_seq_len, -1)) * from_mask | |
view_20: "bf16[1, 12, 832, 64][638976, 53248, 64, 1]cpu" = context_layer_4.view((1, 12, 832, -1)); context_layer_4 = None | |
context_layer_5: "f32[1, 12, 832, 64][638976, 53248, 64, 1]cpu" = view_20 * l_from_mask_; view_20 = l_from_mask_ = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:841 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer = torch.transpose(context_layer, 1, 2) | |
context_layer_6: "f32[1, 832, 12, 64][638976, 64, 53248, 1]cpu" = torch.transpose(context_layer_5, 1, 2); context_layer_5 = None | |
return (context_layer_6, rand_attn_2) | |
V0627 17:31:05.175000 139845268738432 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:199] {"aot_forward_graph": {}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0, "has_payload": "9d92a7e58f208e3c617d3e5fb4f3ee25"} | |
class <lambda>(torch.nn.Module): | |
def forward(self, arg0_1: "i32[11, 3][3, 1]cpu", arg1_1: "i32[11, 3][3, 1]cpu", arg2_1: "i32[11, 3][3, 1]cpu", arg3_1: "i32[11, 3][3, 1]cpu", arg4_1: "i32[11, 3][3, 1]cpu", arg5_1: "i32[11, 3][3, 1]cpu", arg6_1: "i32[11, 3][3, 1]cpu", arg7_1: "i32[11, 3][3, 1]cpu", arg8_1: "i32[11, 3][3, 1]cpu", arg9_1: "i32[11, 3][3, 1]cpu", arg10_1: "i32[11, 3][3, 1]cpu", arg11_1: "i32[11, 3][3, 1]cpu", arg12_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", arg13_1: "f32[1, 13, 64][832, 64, 1]cpu", arg14_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", arg15_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", arg16_1: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu", arg17_1: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu", arg18_1: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu"): | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:593 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = np.stack(rand_attn, axis=0) | |
cat: "i32[132, 3][3, 1]cpu" = torch.ops.aten.cat.default([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1]); arg0_1 = arg1_1 = arg2_1 = arg3_1 = arg4_1 = arg5_1 = arg6_1 = arg7_1 = arg8_1 = arg9_1 = arg10_1 = arg11_1 = None | |
view: "i32[12, 11, 3][33, 3, 1]cpu" = torch.ops.aten.view.default(cat, [12, 11, 3]); cat = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:594 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = torch.tensor(rand_attn, device=query_layer.device, dtype=torch.long) | |
alias: "i32[12, 11, 3][33, 3, 1]cpu" = torch.ops.aten.alias.default(view); view = None | |
alias_1: "i32[12, 11, 3][33, 3, 1]cpu" = torch.ops.aten.alias.default(alias); alias = None | |
alias_2: "i32[12, 11, 3][33, 3, 1]cpu" = torch.ops.aten.alias.default(alias_1); alias_1 = None | |
convert_element_type: "i64[12, 11, 3][33, 3, 1]cpu" = torch.ops.prims.convert_element_type.default(alias_2, torch.int64); alias_2 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:595 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn.unsqueeze_(0) | |
unsqueeze: "i64[1, 12, 11, 3][396, 33, 3, 1]cpu" = torch.ops.aten.unsqueeze.default(convert_element_type, 0); convert_element_type = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:596 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = torch.cat([rand_attn for _ in range(batch_size)], dim=0) | |
clone: "i64[1, 12, 11, 3][396, 33, 3, 1]cpu" = torch.ops.aten.clone.default(unsqueeze); unsqueeze = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in _create_rand_mask_from_inputs, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)]) | |
select: "f32[13, 64][64, 1]cpu" = torch.ops.aten.select.int(arg13_1, 0, 0) | |
select_1: "i64[12, 11, 3][33, 3, 1]cpu" = torch.ops.aten.select.int(clone, 0, 0) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in <listcomp>, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)]) | |
view_1: "i64[396][1]cpu" = torch.ops.aten.view.default(select_1, [396]); select_1 = None | |
index: "f32[396, 64][64, 1]cpu" = torch.ops.aten.index.Tensor(select, [view_1]); select = view_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in _create_rand_mask_from_inputs, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)]) | |
clone_1: "f32[396, 64][64, 1]cpu" = torch.ops.aten.clone.default(index); index = None | |
view_2: "f32[1, 396, 64][25344, 64, 1]cpu" = torch.ops.aten.view.default(clone_1, [1, 396, 64]); clone_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1016 in _create_rand_mask_from_inputs, code: rand_mask = rand_mask.view(batch_size, num_attention_heads, num_windows, num_rand_blocks * from_block_size) | |
view_3: "f32[1, 12, 11, 192][25344, 2112, 192, 1]cpu" = torch.ops.aten.view.default(view_2, [1, 12, 11, 192]); view_2 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1017 in _create_rand_mask_from_inputs, code: rand_mask = torch.einsum("blq,bhlk->bhlqk", from_blocked_mask[:, 1:-1], rand_mask) | |
slice_1: "f32[1, 13, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(arg13_1, 0, 0, 9223372036854775807); arg13_1 = None | |
slice_2: "f32[1, 11, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_1, 1, 1, -1); slice_1 = None | |
unsqueeze_1: "f32[1, 11, 64, 1][832, 64, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_2, 3); slice_2 = None | |
unsqueeze_2: "f32[1, 11, 64, 1, 1][832, 64, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_1, 4); unsqueeze_1 = None | |
permute: "f32[1, 1, 11, 64, 1][832, 1, 64, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_2, [0, 3, 1, 2, 4]); unsqueeze_2 = None | |
unsqueeze_3: "f32[1, 12, 11, 192, 1][25344, 2112, 192, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(view_3, 4); view_3 = None | |
permute_1: "f32[1, 12, 11, 1, 192][25344, 2112, 192, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_3, [0, 1, 2, 4, 3]); unsqueeze_3 = None | |
mul: "f32[1, 12, 11, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(permute, permute_1); permute = permute_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:602 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_query_matrix = query_layer.view(bsz, n_heads, from_seq_len // from_block_size, from_block_size, -1) | |
view_4: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.view.default(arg12_1, [1, 12, 13, 64, -1]); arg12_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:603 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix = key_layer.view(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1) | |
view_5: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.view.default(arg14_1, [1, 12, 13, 64, -1]) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:604 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix = value_layer.view(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1) | |
view_6: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.view.default(arg15_1, [1, 12, 13, 64, -1]) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:972 in torch_gather_b2, code: shift = torch.arange(indices.shape[0] * indices.shape[1] * num_indices_to_gather, device=indices.device) | |
iota: "i64[396][1]cpu" = torch.ops.prims.iota.default(396, start = 0, step = 1, dtype = torch.int64, device = device(type='cpu'), requires_grad = False) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:973 in torch_gather_b2, code: indices_shift = torch.div(shift, num_indices_to_gather, rounding_mode="floor") * num_indices_to_pick_from | |
div: "i64[396][1]cpu" = torch.ops.aten.div.Tensor_mode(iota, 33, rounding_mode = 'floor'); iota = None | |
mul_1: "i64[396][1]cpu" = torch.ops.aten.mul.Tensor(div, 13); div = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:975 in torch_gather_b2, code: flattened_indices = indices.view(-1) + indices_shift | |
view_7: "i64[396][1]cpu" = torch.ops.aten.view.default(clone, [-1]) | |
add: "i64[396][1]cpu" = torch.ops.aten.add.Tensor(view_7, mul_1); view_7 = mul_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:976 in torch_gather_b2, code: flattened_params = params.reshape(-1, params.shape[-2], params.shape[-1]) | |
clone_2: "bf16[1, 12, 13, 64, 64][638976, 53248, 4096, 64, 1]cpu" = torch.ops.aten.clone.default(view_5, memory_format = torch.contiguous_format) | |
view_8: "bf16[156, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.view.default(clone_2, [156, 64, 64]); clone_2 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:978 in torch_gather_b2, code: out_flattened = flattened_params.index_select(0, flattened_indices) | |
index_1: "bf16[396, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.index.Tensor(view_8, [add]); view_8 = add = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:980 in torch_gather_b2, code: out = out_flattened.reshape(params.shape[:2] + (num_indices_to_gather,) + params.shape[3:]) | |
view_9: "bf16[1, 12, 33, 64, 64][1622016, 135168, 4096, 64, 1]cpu" = torch.ops.aten.view.default(index_1, [1, 12, 33, 64, 64]); index_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:608 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key = gathered_key.view( | |
view_10: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.view.default(view_9, [1, 12, 11, 192, -1]); view_9 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:972 in torch_gather_b2, code: shift = torch.arange(indices.shape[0] * indices.shape[1] * num_indices_to_gather, device=indices.device) | |
iota_1: "i64[396][1]cpu" = torch.ops.prims.iota.default(396, start = 0, step = 1, dtype = torch.int64, device = device(type='cpu'), requires_grad = False) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:973 in torch_gather_b2, code: indices_shift = torch.div(shift, num_indices_to_gather, rounding_mode="floor") * num_indices_to_pick_from | |
div_1: "i64[396][1]cpu" = torch.ops.aten.div.Tensor_mode(iota_1, 33, rounding_mode = 'floor'); iota_1 = None | |
mul_2: "i64[396][1]cpu" = torch.ops.aten.mul.Tensor(div_1, 13); div_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:975 in torch_gather_b2, code: flattened_indices = indices.view(-1) + indices_shift | |
view_11: "i64[396][1]cpu" = torch.ops.aten.view.default(clone, [-1]) | |
add_1: "i64[396][1]cpu" = torch.ops.aten.add.Tensor(view_11, mul_2); view_11 = mul_2 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:976 in torch_gather_b2, code: flattened_params = params.reshape(-1, params.shape[-2], params.shape[-1]) | |
clone_3: "bf16[1, 12, 13, 64, 64][638976, 53248, 4096, 64, 1]cpu" = torch.ops.aten.clone.default(view_6, memory_format = torch.contiguous_format) | |
view_12: "bf16[156, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.view.default(clone_3, [156, 64, 64]); clone_3 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:978 in torch_gather_b2, code: out_flattened = flattened_params.index_select(0, flattened_indices) | |
index_2: "bf16[396, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.index.Tensor(view_12, [add_1]); view_12 = add_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:980 in torch_gather_b2, code: out = out_flattened.reshape(params.shape[:2] + (num_indices_to_gather,) + params.shape[3:]) | |
view_13: "bf16[1, 12, 33, 64, 64][1622016, 135168, 4096, 64, 1]cpu" = torch.ops.aten.view.default(index_2, [1, 12, 33, 64, 64]); index_2 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:612 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value = gathered_value.view( | |
view_14: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.view.default(view_13, [1, 12, 11, 192, -1]); view_13 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:621 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, 0], key_layer, ndim=4) | |
slice_3: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_4, 0, 0, 9223372036854775807) | |
slice_4: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_3, 1, 0, 9223372036854775807); slice_3 = None | |
select_2: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_4, 2, 0); slice_4 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2) | |
view_15: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(select_2, [12, 64, 64]); select_2 = None | |
view_16: "bf16[12, 832, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(arg14_1, [12, 832, 64]) | |
permute_2: "bf16[12, 64, 832][64, 1, 768]cpu" = torch.ops.aten.permute.default(view_16, [0, 2, 1]); view_16 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm( | |
bmm: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.ops.aten.bmm.default(view_15, permute_2); view_15 = permute_2 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2])) | |
view_17: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.view.default(bmm, [1, 12, 64, 832]); bmm = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:623 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product = first_product * rsqrt_d | |
mul_3: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.mul.Tensor(view_17, 0.125); view_17 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:624 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product += (1.0 - to_mask) * attn_mask_penalty | |
sub: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, arg16_1) | |
mul_4: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.mul.Tensor(sub, -10000.0); sub = None | |
add_2: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.add.Tensor(mul_3, mul_4); mul_3 = mul_4 = None | |
convert_element_type_3: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.prims.convert_element_type.default(add_2, torch.bfloat16); add_2 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:625 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_attn_weights = nn.functional.softmax( | |
convert_element_type_4: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.prims.convert_element_type.default(convert_element_type_3, torch.float32); convert_element_type_3 = None | |
amax: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.amax.default(convert_element_type_4, [-1], True) | |
sub_1: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.sub.Tensor(convert_element_type_4, amax); convert_element_type_4 = amax = None | |
exp: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.exp.default(sub_1); sub_1 = None | |
sum_1: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp, [-1], True) | |
div_2: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.div.Tensor(exp, sum_1); exp = sum_1 = None | |
convert_element_type_5: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.prims.convert_element_type.default(div_2, torch.bfloat16); div_2 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view( | |
view_18: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.ops.aten.view.default(convert_element_type_5, [-1, 64, 832]); convert_element_type_5 = None | |
view_19: "bf16[12, 832, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(arg15_1, [12, 832, 64]) | |
bmm_1: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_18, view_19); view_18 = view_19 = None | |
view_20: "bf16[1, 12, 64, 64][49152, 4096, 64, 1]cpu" = torch.ops.aten.view.default(bmm_1, [1, 12, 64, 64]); bmm_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:631 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_context_layer.unsqueeze_(2) | |
unsqueeze_4: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(view_20, 2); view_20 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:641 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 0], | |
slice_5: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807) | |
slice_6: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_5, 1, 0, 9223372036854775807); slice_5 = None | |
select_3: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_6, 2, 0); slice_6 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:642 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 1], | |
slice_7: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807) | |
slice_8: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_7, 1, 0, 9223372036854775807); slice_7 = None | |
select_4: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_8, 2, 1); slice_8 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:643 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 2], | |
slice_9: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807) | |
slice_10: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_9, 1, 0, 9223372036854775807); slice_9 = None | |
select_5: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_10, 2, 2); slice_10 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:644 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -1], | |
slice_11: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807) | |
slice_12: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_11, 1, 0, 9223372036854775807); slice_11 = None | |
select_6: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_12, 2, -1); slice_12 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:645 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key[:, :, 0], | |
slice_13: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view_10, 0, 0, 9223372036854775807) | |
slice_14: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_13, 1, 0, 9223372036854775807); slice_13 = None | |
select_7: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = torch.ops.aten.select.int(slice_14, 2, 0); slice_14 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:639 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_key_mat = torch.cat( | |
cat_1: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" = torch.ops.aten.cat.default([select_3, select_4, select_5, select_6, select_7], 2); select_3 = select_4 = select_5 = select_6 = select_7 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:651 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 0], | |
slice_15: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807) | |
slice_16: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_15, 1, 0, 9223372036854775807); slice_15 = None | |
select_8: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_16, 2, 0); slice_16 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:652 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 1], | |
slice_17: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807) | |
slice_18: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_17, 1, 0, 9223372036854775807); slice_17 = None | |
select_9: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_18, 2, 1); slice_18 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:653 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 2], | |
slice_19: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807) | |
slice_20: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_19, 1, 0, 9223372036854775807); slice_19 = None | |
select_10: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_20, 2, 2); slice_20 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:654 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -1], | |
slice_21: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807) | |
slice_22: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_21, 1, 0, 9223372036854775807); slice_21 = None | |
select_11: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_22, 2, -1); slice_22 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:655 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value[:, :, 0], | |
slice_23: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view_14, 0, 0, 9223372036854775807) | |
slice_24: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_23, 1, 0, 9223372036854775807); slice_23 = None | |
select_12: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = torch.ops.aten.select.int(slice_24, 2, 0); slice_24 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:649 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_value_mat = torch.cat( | |
cat_2: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" = torch.ops.aten.cat.default([select_8, select_9, select_10, select_11, select_12], 2); select_8 = select_9 = select_10 = select_11 = select_12 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:661 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, 1], second_key_mat, ndim=4) | |
slice_25: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_4, 0, 0, 9223372036854775807) | |
slice_26: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_25, 1, 0, 9223372036854775807); slice_25 = None | |
select_13: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_26, 2, 1); slice_26 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2) | |
view_21: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(select_13, [12, 64, 64]); select_13 = None | |
view_22: "bf16[12, 448, 64][28672, 64, 1]cpu" = torch.ops.aten.view.default(cat_1, [-1, 448, 64]); cat_1 = None | |
permute_3: "bf16[12, 64, 448][28672, 1, 64]cpu" = torch.ops.aten.permute.default(view_22, [0, 2, 1]); view_22 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm( | |
bmm_2: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.ops.aten.bmm.default(view_21, permute_3); view_21 = permute_3 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2])) | |
view_23: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.view.default(bmm_2, [1, 12, 64, 448]); bmm_2 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:664 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, : 3 * to_block_size], | |
slice_27: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 0, 0, 9223372036854775807) | |
slice_28: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_27, 1, 0, 9223372036854775807); slice_27 = None | |
slice_29: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_28, 2, 0, 9223372036854775807); slice_28 = None | |
slice_30: "f32[1, 1, 1, 192][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_29, 3, 0, 192); slice_29 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:665 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, -to_block_size:], | |
slice_31: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 0, 0, 9223372036854775807) | |
slice_32: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_31, 1, 0, 9223372036854775807); slice_31 = None | |
slice_33: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_32, 2, 0, 9223372036854775807); slice_32 = None | |
slice_34: "f32[1, 1, 1, 64][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_33, 3, -64, 9223372036854775807); slice_33 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:666 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask.new_ones([bsz, 1, 1, n_rand_blocks * to_block_size]), | |
full: "f32[1, 1, 1, 192][192, 192, 192, 1]cpu" = torch.ops.aten.full.default([1, 1, 1, 192], 1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:662 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_seq_pad = torch.cat( | |
cat_3: "f32[1, 1, 1, 448][448, 448, 448, 1]cpu" = torch.ops.aten.cat.default([slice_30, slice_34, full], 3); slice_30 = slice_34 = full = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:672 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask.new_ones([bsz, n_heads, from_block_size, 4 * to_block_size]), | |
full_1: "f32[1, 12, 64, 256][196608, 16384, 256, 1]cpu" = torch.ops.aten.full.default([1, 12, 64, 256], 1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:673 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask[:, :, 0], | |
slice_35: "f32[1, 12, 11, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.ops.aten.slice.Tensor(mul, 0, 0, 9223372036854775807) | |
slice_36: "f32[1, 12, 11, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.ops.aten.slice.Tensor(slice_35, 1, 0, 9223372036854775807); slice_35 = None | |
select_14: "f32[1, 12, 64, 192][1622016, 135168, 192, 1]cpu" = torch.ops.aten.select.int(slice_36, 2, 0); slice_36 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:670 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_rand_pad = torch.cat( | |
cat_4: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.cat.default([full_1, select_14], 3); full_1 = select_14 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:677 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product = second_product * rsqrt_d | |
mul_5: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.mul.Tensor(view_23, 0.125); view_23 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:678 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product += (1.0 - torch.minimum(second_seq_pad, second_rand_pad)) * attn_mask_penalty | |
minimum: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.minimum.default(cat_3, cat_4); cat_3 = cat_4 = None | |
sub_2: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, minimum); minimum = None | |
mul_6: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.mul.Tensor(sub_2, -10000.0); sub_2 = None | |
add_3: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.add.Tensor(mul_5, mul_6); mul_5 = mul_6 = None | |
convert_element_type_10: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.prims.convert_element_type.default(add_3, torch.bfloat16); add_3 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:679 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_attn_weights = nn.functional.softmax( | |
convert_element_type_11: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.prims.convert_element_type.default(convert_element_type_10, torch.float32); convert_element_type_10 = None | |
amax_1: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.amax.default(convert_element_type_11, [-1], True) | |
sub_3: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.sub.Tensor(convert_element_type_11, amax_1); convert_element_type_11 = amax_1 = None | |
exp_1: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.exp.default(sub_3); sub_3 = None | |
sum_2: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp_1, [-1], True) | |
div_3: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.div.Tensor(exp_1, sum_2); exp_1 = sum_2 = None | |
convert_element_type_12: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.prims.convert_element_type.default(div_3, torch.bfloat16); div_3 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view( | |
view_24: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.ops.aten.view.default(convert_element_type_12, [-1, 64, 448]); convert_element_type_12 = None | |
view_25: "bf16[12, 448, 64][28672, 64, 1]cpu" = torch.ops.aten.view.default(cat_2, [-1, 448, 64]); cat_2 = None | |
bmm_3: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_24, view_25); view_24 = view_25 = None | |
view_26: "bf16[1, 12, 64, 64][49152, 4096, 64, 1]cpu" = torch.ops.aten.view.default(bmm_3, [1, 12, 64, 64]); bmm_3 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:686 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_context_layer.unsqueeze_(2) | |
unsqueeze_5: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(view_26, 2); view_26 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:696 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: [blocked_key_matrix[:, :, 1:-3], blocked_key_matrix[:, :, 2:-2], blocked_key_matrix[:, :, 3:-1]], dim=3 | |
slice_37: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807) | |
slice_38: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_37, 1, 0, 9223372036854775807); slice_37 = None | |
slice_39: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_38, 2, 1, -3); slice_38 = None | |
slice_40: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807) | |
slice_41: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_40, 1, 0, 9223372036854775807); slice_40 = None | |
slice_42: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_41, 2, 2, -2); slice_41 = None | |
slice_43: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807) | |
slice_44: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_43, 1, 0, 9223372036854775807); slice_43 = None | |
slice_45: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_44, 2, 3, -1); slice_44 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:695 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: exp_blocked_key_matrix = torch.cat( | |
cat_5: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" = torch.ops.aten.cat.default([slice_39, slice_42, slice_45], 3); slice_39 = slice_42 = slice_45 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:699 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: [blocked_value_matrix[:, :, 1:-3], blocked_value_matrix[:, :, 2:-2], blocked_value_matrix[:, :, 3:-1]], | |
slice_46: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807) | |
slice_47: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_46, 1, 0, 9223372036854775807); slice_46 = None | |
slice_48: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_47, 2, 1, -3); slice_47 = None | |
slice_49: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807) | |
slice_50: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_49, 1, 0, 9223372036854775807); slice_49 = None | |
slice_51: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_50, 2, 2, -2); slice_50 = None | |
slice_52: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807) | |
slice_53: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_52, 1, 0, 9223372036854775807); slice_52 = None | |
slice_54: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_53, 2, 3, -1); slice_53 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:698 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: exp_blocked_value_matrix = torch.cat( | |
cat_6: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" = torch.ops.aten.cat.default([slice_48, slice_51, slice_54], 3); slice_48 = slice_51 = slice_54 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:702 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: middle_query_matrix = blocked_query_matrix[:, :, 2:-2] | |
slice_55: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_4, 0, 0, 9223372036854775807) | |
slice_56: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_55, 1, 0, 9223372036854775807); slice_55 = None | |
slice_57: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_56, 2, 2, -2); slice_56 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2) | |
clone_4: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.clone.default(slice_57, memory_format = torch.contiguous_format) | |
view_27: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.view.default(clone_4, [108, 64, 64]); clone_4 = None | |
view_28: "bf16[108, 192, 64][12288, 64, 1]cpu" = torch.ops.aten.view.default(cat_5, [-1, 192, 64]); cat_5 = None | |
permute_4: "bf16[108, 64, 192][12288, 1, 64]cpu" = torch.ops.aten.permute.default(view_28, [0, 2, 1]); view_28 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm( | |
bmm_4: "bf16[108, 64, 192][12288, 192, 1]cpu" = torch.ops.aten.bmm.default(view_27, permute_4); view_27 = permute_4 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2])) | |
view_29: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.view.default(bmm_4, [1, 12, 9, 64, 192]); bmm_4 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:708 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: inner_band_product = inner_band_product * rsqrt_d | |
mul_7: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(view_29, 0.125); view_29 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:712 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product = self.torch_bmm_nd_transpose(middle_query_matrix, gathered_key[:, :, 1:-1], ndim=5) | |
slice_58: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view_10, 0, 0, 9223372036854775807) | |
slice_59: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_58, 1, 0, 9223372036854775807); slice_58 = None | |
slice_60: "bf16[1, 12, 9, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_59, 2, 1, -1); slice_59 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2) | |
clone_5: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.clone.default(slice_57, memory_format = torch.contiguous_format) | |
view_30: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.view.default(clone_5, [108, 64, 64]); clone_5 = None | |
clone_6: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" = torch.ops.aten.clone.default(slice_60, memory_format = torch.contiguous_format); slice_60 = None | |
view_31: "bf16[108, 192, 64][12288, 64, 1]cpu" = torch.ops.aten.view.default(clone_6, [108, 192, 64]); clone_6 = None | |
permute_5: "bf16[108, 64, 192][12288, 1, 64]cpu" = torch.ops.aten.permute.default(view_31, [0, 2, 1]); view_31 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm( | |
bmm_5: "bf16[108, 64, 192][12288, 192, 1]cpu" = torch.ops.aten.bmm.default(view_30, permute_5); view_30 = permute_5 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2])) | |
view_32: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.view.default(bmm_5, [1, 12, 9, 64, 192]); bmm_5 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:714 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product = rand_band_product * rsqrt_d | |
mul_8: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(view_32, 0.125); view_32 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:718 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqd,bhkd->bhlqk", middle_query_matrix, blocked_key_matrix[:, :, 0] | |
slice_61: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807) | |
slice_62: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_61, 1, 0, 9223372036854775807); slice_61 = None | |
select_15: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_62, 2, 0); slice_62 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:717 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product = torch.einsum( | |
unsqueeze_6: "bf16[1, 12, 9, 64, 64, 1][638976, 64, 49152, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_57, 5) | |
permute_6: "bf16[1, 12, 9, 64, 1, 64][638976, 64, 49152, 768, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_6, [0, 1, 2, 3, 5, 4]); unsqueeze_6 = None | |
unsqueeze_7: "bf16[1, 12, 64, 64, 1][638976, 64, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(select_15, 4); select_15 = None | |
unsqueeze_8: "bf16[1, 12, 64, 64, 1, 1][638976, 64, 768, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_7, 5); unsqueeze_7 = None | |
permute_7: "bf16[1, 12, 1, 1, 64, 64][638976, 64, 1, 1, 768, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_8, [0, 1, 4, 5, 2, 3]); unsqueeze_8 = None | |
permute_8: "bf16[12, 9, 64, 64, 1, 1][64, 49152, 768, 1, 638976, 1]cpu" = torch.ops.aten.permute.default(permute_6, [1, 2, 3, 5, 0, 4]); permute_6 = None | |
view_33: "bf16[12, 576, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(permute_8, [12, 576, 64]); permute_8 = None | |
permute_9: "bf16[12, 64, 1, 64, 1, 1][64, 1, 638976, 768, 1, 1]cpu" = torch.ops.aten.permute.default(permute_7, [1, 5, 0, 4, 2, 3]); permute_7 = None | |
view_34: "bf16[12, 64, 64][64, 1, 768]cpu" = torch.ops.aten.view.default(permute_9, [12, 64, 64]); permute_9 = None | |
bmm_6: "bf16[12, 576, 64][36864, 64, 1]cpu" = torch.ops.aten.bmm.default(view_33, view_34); view_33 = view_34 = None | |
view_35: "bf16[12, 9, 64, 1, 1, 64][36864, 4096, 64, 64, 64, 1]cpu" = torch.ops.aten.view.default(bmm_6, [12, 9, 64, 1, 1, 64]); bmm_6 = None | |
permute_10: "bf16[1, 12, 9, 64, 64, 1][64, 36864, 4096, 64, 1, 64]cpu" = torch.ops.aten.permute.default(view_35, [4, 0, 1, 2, 5, 3]); view_35 = None | |
view_36: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.ops.aten.view.default(permute_10, [1, 12, 9, 64, 64]); permute_10 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:720 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product = first_band_product * rsqrt_d | |
mul_9: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.mul.Tensor(view_36, 0.125); view_36 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:724 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqd,bhkd->bhlqk", middle_query_matrix, blocked_key_matrix[:, :, -1] | |
slice_63: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807) | |
slice_64: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_63, 1, 0, 9223372036854775807); slice_63 = None | |
select_16: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_64, 2, -1); slice_64 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:723 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product = torch.einsum( | |
unsqueeze_9: "bf16[1, 12, 9, 64, 64, 1][638976, 64, 49152, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_57, 5); slice_57 = None | |
permute_11: "bf16[1, 12, 9, 64, 1, 64][638976, 64, 49152, 768, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_9, [0, 1, 2, 3, 5, 4]); unsqueeze_9 = None | |
unsqueeze_10: "bf16[1, 12, 64, 64, 1][638976, 64, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(select_16, 4); select_16 = None | |
unsqueeze_11: "bf16[1, 12, 64, 64, 1, 1][638976, 64, 768, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_10, 5); unsqueeze_10 = None | |
permute_12: "bf16[1, 12, 1, 1, 64, 64][638976, 64, 1, 1, 768, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_11, [0, 1, 4, 5, 2, 3]); unsqueeze_11 = None | |
permute_13: "bf16[12, 9, 64, 64, 1, 1][64, 49152, 768, 1, 638976, 1]cpu" = torch.ops.aten.permute.default(permute_11, [1, 2, 3, 5, 0, 4]); permute_11 = None | |
view_37: "bf16[12, 576, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(permute_13, [12, 576, 64]); permute_13 = None | |
permute_14: "bf16[12, 64, 1, 64, 1, 1][64, 1, 638976, 768, 1, 1]cpu" = torch.ops.aten.permute.default(permute_12, [1, 5, 0, 4, 2, 3]); permute_12 = None | |
view_38: "bf16[12, 64, 64][64, 1, 768]cpu" = torch.ops.aten.view.default(permute_14, [12, 64, 64]); permute_14 = None | |
bmm_7: "bf16[12, 576, 64][36864, 64, 1]cpu" = torch.ops.aten.bmm.default(view_37, view_38); view_37 = view_38 = None | |
view_39: "bf16[12, 9, 64, 1, 1, 64][36864, 4096, 64, 64, 64, 1]cpu" = torch.ops.aten.view.default(bmm_7, [12, 9, 64, 1, 1, 64]); bmm_7 = None | |
permute_15: "bf16[1, 12, 9, 64, 64, 1][64, 36864, 4096, 64, 1, 64]cpu" = torch.ops.aten.permute.default(view_39, [4, 0, 1, 2, 5, 3]); view_39 = None | |
view_40: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.ops.aten.view.default(permute_15, [1, 12, 9, 64, 64]); permute_15 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:726 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product = last_band_product * rsqrt_d | |
mul_10: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.mul.Tensor(view_40, 0.125); view_40 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:729 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: inner_band_product += (1.0 - band_mask) * attn_mask_penalty | |
sub_4: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, arg17_1); arg17_1 = None | |
mul_11: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(sub_4, -10000.0); sub_4 = None | |
add_4: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.add.Tensor(mul_7, mul_11); mul_7 = mul_11 = None | |
convert_element_type_23: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.prims.convert_element_type.default(add_4, torch.bfloat16); add_4 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:730 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product += (1.0 - to_mask[:, :, :, :to_block_size].unsqueeze(3)) * attn_mask_penalty | |
slice_65: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 0, 0, 9223372036854775807) | |
slice_66: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_65, 1, 0, 9223372036854775807); slice_65 = None | |
slice_67: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_66, 2, 0, 9223372036854775807); slice_66 = None | |
slice_68: "f32[1, 1, 1, 64][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_67, 3, 0, 64); slice_67 = None | |
unsqueeze_12: "f32[1, 1, 1, 1, 64][832, 832, 832, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_68, 3); slice_68 = None | |
sub_5: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, unsqueeze_12); unsqueeze_12 = None | |
mul_12: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = torch.ops.aten.mul.Tensor(sub_5, -10000.0); sub_5 = None | |
add_5: "f32[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(mul_9, mul_12); mul_9 = mul_12 = None | |
convert_element_type_24: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.prims.convert_element_type.default(add_5, torch.bfloat16); add_5 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:731 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product += (1.0 - to_mask[:, :, :, -to_block_size:].unsqueeze(3)) * attn_mask_penalty | |
slice_69: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 0, 0, 9223372036854775807) | |
slice_70: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_69, 1, 0, 9223372036854775807); slice_69 = None | |
slice_71: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_70, 2, 0, 9223372036854775807); slice_70 = None | |
slice_72: "f32[1, 1, 1, 64][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_71, 3, -64, 9223372036854775807); slice_71 = None | |
unsqueeze_13: "f32[1, 1, 1, 1, 64][832, 832, 832, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_72, 3); slice_72 = None | |
sub_6: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, unsqueeze_13); unsqueeze_13 = None | |
mul_13: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = torch.ops.aten.mul.Tensor(sub_6, -10000.0); sub_6 = None | |
add_6: "f32[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(mul_10, mul_13); mul_10 = mul_13 = None | |
convert_element_type_25: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.prims.convert_element_type.default(add_6, torch.bfloat16); add_6 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:732 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product += (1.0 - rand_mask[:, :, 1:-1]) * attn_mask_penalty | |
slice_73: "f32[1, 12, 11, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.ops.aten.slice.Tensor(mul, 0, 0, 9223372036854775807) | |
slice_74: "f32[1, 12, 11, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.ops.aten.slice.Tensor(slice_73, 1, 0, 9223372036854775807); slice_73 = None | |
slice_75: "f32[1, 12, 9, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.ops.aten.slice.Tensor(slice_74, 2, 1, -1); slice_74 = None | |
sub_7: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, slice_75); slice_75 = None | |
mul_14: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(sub_7, -10000.0); sub_7 = None | |
add_7: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.add.Tensor(mul_8, mul_14); mul_8 = mul_14 = None | |
convert_element_type_26: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.prims.convert_element_type.default(add_7, torch.bfloat16); add_7 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:735 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: band_product = torch.cat( | |
cat_7: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.cat.default([convert_element_type_24, convert_element_type_23, convert_element_type_26, convert_element_type_25], -1); convert_element_type_24 = convert_element_type_23 = convert_element_type_26 = convert_element_type_25 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:740 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights = nn.functional.softmax( | |
convert_element_type_27: "f32[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.prims.convert_element_type.default(cat_7, torch.float32); cat_7 = None | |
amax_2: "f32[1, 12, 9, 64, 1][6912, 576, 64, 1, 1]cpu" = torch.ops.aten.amax.default(convert_element_type_27, [-1], True) | |
sub_8: "f32[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.sub.Tensor(convert_element_type_27, amax_2); convert_element_type_27 = amax_2 = None | |
exp_2: "f32[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.exp.default(sub_8); sub_8 = None | |
sum_3: "f32[1, 12, 9, 64, 1][6912, 576, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp_2, [-1], True) | |
div_4: "f32[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.div.Tensor(exp_2, sum_3); exp_2 = sum_3 = None | |
convert_element_type_28: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.prims.convert_element_type.default(div_4, torch.bfloat16); div_4 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:747 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights[:, :, :, :, to_block_size : 4 * to_block_size], exp_blocked_value_matrix, ndim=5 | |
slice_76: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(convert_element_type_28, 0, 0, 9223372036854775807) | |
slice_77: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_76, 1, 0, 9223372036854775807); slice_76 = None | |
slice_78: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_77, 2, 0, 9223372036854775807); slice_77 = None | |
slice_79: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_78, 3, 0, 9223372036854775807); slice_78 = None | |
slice_80: "bf16[1, 12, 9, 64, 192][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_79, 4, 64, 256); slice_79 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view( | |
view_41: "bf16[108, 64, 192][32768, 512, 1]cpu" = torch.ops.aten.view.default(slice_80, [108, 64, 192]); slice_80 = None | |
view_42: "bf16[108, 192, 64][12288, 64, 1]cpu" = torch.ops.aten.view.default(cat_6, [-1, 192, 64]); cat_6 = None | |
bmm_8: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_41, view_42); view_41 = view_42 = None | |
view_43: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.view.default(bmm_8, [1, 12, 9, 64, 64]); bmm_8 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:754 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights[:, :, :, :, 4 * to_block_size : -to_block_size], gathered_value[:, :, 1:-1], ndim=5 | |
slice_81: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(convert_element_type_28, 0, 0, 9223372036854775807) | |
slice_82: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_81, 1, 0, 9223372036854775807); slice_81 = None | |
slice_83: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_82, 2, 0, 9223372036854775807); slice_82 = None | |
slice_84: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_83, 3, 0, 9223372036854775807); slice_83 = None | |
slice_85: "bf16[1, 12, 9, 64, 192][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_84, 4, 256, -64); slice_84 = None | |
slice_86: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view_14, 0, 0, 9223372036854775807) | |
slice_87: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_86, 1, 0, 9223372036854775807); slice_86 = None | |
slice_88: "bf16[1, 12, 9, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_87, 2, 1, -1); slice_87 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view( | |
view_44: "bf16[108, 64, 192][32768, 512, 1]cpu" = torch.ops.aten.view.default(slice_85, [108, 64, 192]); slice_85 = None | |
clone_7: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" = torch.ops.aten.clone.default(slice_88, memory_format = torch.contiguous_format); slice_88 = None | |
view_45: "bf16[108, 192, 64][12288, 64, 1]cpu" = torch.ops.aten.view.default(clone_7, [108, 192, 64]); clone_7 = None | |
bmm_9: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_44, view_45); view_44 = view_45 = None | |
view_46: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.view.default(bmm_9, [1, 12, 9, 64, 64]); bmm_9 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:753 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += self.torch_bmm_nd( | |
add_8: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(view_43, view_46); view_43 = view_46 = None | |
view_47: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.view.default(add_8, [108, 64, 64]); add_8 = None | |
view_48: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.view.default(view_47, [1, 12, 9, 64, 64]); view_47 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:760 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, :to_block_size], blocked_value_matrix[:, :, 0] | |
slice_89: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(convert_element_type_28, 0, 0, 9223372036854775807) | |
slice_90: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_89, 1, 0, 9223372036854775807); slice_89 = None | |
slice_91: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_90, 2, 0, 9223372036854775807); slice_90 = None | |
slice_92: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_91, 3, 0, 9223372036854775807); slice_91 = None | |
slice_93: "bf16[1, 12, 9, 64, 64][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_92, 4, 0, 64); slice_92 = None | |
slice_94: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807) | |
slice_95: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_94, 1, 0, 9223372036854775807); slice_94 = None | |
select_17: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_95, 2, 0); slice_95 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:759 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += torch.einsum( | |
unsqueeze_14: "bf16[1, 12, 9, 64, 64, 1][3538944, 294912, 32768, 512, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_93, 5); slice_93 = None | |
permute_16: "bf16[1, 12, 9, 64, 1, 64][3538944, 294912, 32768, 512, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_14, [0, 1, 2, 3, 5, 4]); unsqueeze_14 = None | |
unsqueeze_15: "bf16[1, 12, 64, 64, 1][638976, 64, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(select_17, 4); select_17 = None | |
unsqueeze_16: "bf16[1, 12, 64, 64, 1, 1][638976, 64, 768, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_15, 5); unsqueeze_15 = None | |
permute_17: "bf16[1, 12, 1, 1, 64, 64][638976, 64, 1, 1, 1, 768]cpu" = torch.ops.aten.permute.default(unsqueeze_16, [0, 1, 4, 5, 3, 2]); unsqueeze_16 = None | |
permute_18: "bf16[12, 9, 64, 64, 1, 1][294912, 32768, 512, 1, 3538944, 1]cpu" = torch.ops.aten.permute.default(permute_16, [1, 2, 3, 5, 0, 4]); permute_16 = None | |
view_49: "bf16[12, 576, 64][294912, 512, 1]cpu" = torch.ops.aten.view.default(permute_18, [12, 576, 64]); permute_18 = None | |
permute_19: "bf16[12, 64, 1, 64, 1, 1][64, 768, 638976, 1, 1, 1]cpu" = torch.ops.aten.permute.default(permute_17, [1, 5, 0, 4, 2, 3]); permute_17 = None | |
view_50: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(permute_19, [12, 64, 64]); permute_19 = None | |
bmm_10: "bf16[12, 576, 64][36864, 64, 1]cpu" = torch.ops.aten.bmm.default(view_49, view_50); view_49 = view_50 = None | |
view_51: "bf16[12, 9, 64, 1, 1, 64][36864, 4096, 64, 64, 64, 1]cpu" = torch.ops.aten.view.default(bmm_10, [12, 9, 64, 1, 1, 64]); bmm_10 = None | |
permute_20: "bf16[1, 12, 9, 64, 64, 1][64, 36864, 4096, 64, 1, 64]cpu" = torch.ops.aten.permute.default(view_51, [4, 0, 1, 2, 5, 3]); view_51 = None | |
view_52: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.ops.aten.view.default(permute_20, [1, 12, 9, 64, 64]); permute_20 = None | |
add_9: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(view_48, view_52); view_48 = view_52 = None | |
view_53: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.view.default(add_9, [108, 64, 64]); add_9 = None | |
view_54: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.view.default(view_53, [1, 12, 9, 64, 64]); view_53 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:763 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, -to_block_size:], blocked_value_matrix[:, :, -1] | |
slice_96: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(convert_element_type_28, 0, 0, 9223372036854775807); convert_element_type_28 = None | |
slice_97: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_96, 1, 0, 9223372036854775807); slice_96 = None | |
slice_98: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_97, 2, 0, 9223372036854775807); slice_97 = None | |
slice_99: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_98, 3, 0, 9223372036854775807); slice_98 = None | |
slice_100: "bf16[1, 12, 9, 64, 64][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_99, 4, -64, 9223372036854775807); slice_99 = None | |
slice_101: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807) | |
slice_102: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_101, 1, 0, 9223372036854775807); slice_101 = None | |
select_18: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_102, 2, -1); slice_102 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:762 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += torch.einsum( | |
unsqueeze_17: "bf16[1, 12, 9, 64, 64, 1][3538944, 294912, 32768, 512, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_100, 5); slice_100 = None | |
permute_21: "bf16[1, 12, 9, 64, 1, 64][3538944, 294912, 32768, 512, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_17, [0, 1, 2, 3, 5, 4]); unsqueeze_17 = None | |
unsqueeze_18: "bf16[1, 12, 64, 64, 1][638976, 64, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(select_18, 4); select_18 = None | |
unsqueeze_19: "bf16[1, 12, 64, 64, 1, 1][638976, 64, 768, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_18, 5); unsqueeze_18 = None | |
permute_22: "bf16[1, 12, 1, 1, 64, 64][638976, 64, 1, 1, 1, 768]cpu" = torch.ops.aten.permute.default(unsqueeze_19, [0, 1, 4, 5, 3, 2]); unsqueeze_19 = None | |
permute_23: "bf16[12, 9, 64, 64, 1, 1][294912, 32768, 512, 1, 3538944, 1]cpu" = torch.ops.aten.permute.default(permute_21, [1, 2, 3, 5, 0, 4]); permute_21 = None | |
view_55: "bf16[12, 576, 64][294912, 512, 1]cpu" = torch.ops.aten.view.default(permute_23, [12, 576, 64]); permute_23 = None | |
permute_24: "bf16[12, 64, 1, 64, 1, 1][64, 768, 638976, 1, 1, 1]cpu" = torch.ops.aten.permute.default(permute_22, [1, 5, 0, 4, 2, 3]); permute_22 = None | |
view_56: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(permute_24, [12, 64, 64]); permute_24 = None | |
bmm_11: "bf16[12, 576, 64][36864, 64, 1]cpu" = torch.ops.aten.bmm.default(view_55, view_56); view_55 = view_56 = None | |
view_57: "bf16[12, 9, 64, 1, 1, 64][36864, 4096, 64, 64, 64, 1]cpu" = torch.ops.aten.view.default(bmm_11, [12, 9, 64, 1, 1, 64]); bmm_11 = None | |
permute_25: "bf16[1, 12, 9, 64, 64, 1][64, 36864, 4096, 64, 1, 64]cpu" = torch.ops.aten.permute.default(view_57, [4, 0, 1, 2, 5, 3]); view_57 = None | |
view_58: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.ops.aten.view.default(permute_25, [1, 12, 9, 64, 64]); permute_25 = None | |
add_10: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(view_54, view_58); view_54 = view_58 = None | |
view_59: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.view.default(add_10, [108, 64, 64]); add_10 = None | |
view_60: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.view.default(view_59, [1, 12, 9, 64, 64]); view_59 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:775 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 0], | |
slice_103: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807) | |
slice_104: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_103, 1, 0, 9223372036854775807); slice_103 = None | |
select_19: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_104, 2, 0); slice_104 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:776 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -3], | |
slice_105: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807) | |
slice_106: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_105, 1, 0, 9223372036854775807); slice_105 = None | |
select_20: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_106, 2, -3); slice_106 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:777 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -2], | |
slice_107: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807) | |
slice_108: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_107, 1, 0, 9223372036854775807); slice_107 = None | |
select_21: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_108, 2, -2); slice_108 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:778 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -1], | |
slice_109: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807); view_5 = None | |
slice_110: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_109, 1, 0, 9223372036854775807); slice_109 = None | |
select_22: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_110, 2, -1); slice_110 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:779 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key[:, :, -1], | |
slice_111: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view_10, 0, 0, 9223372036854775807); view_10 = None | |
slice_112: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_111, 1, 0, 9223372036854775807); slice_111 = None | |
select_23: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = torch.ops.aten.select.int(slice_112, 2, -1); slice_112 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:773 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_key_mat = torch.cat( | |
cat_8: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" = torch.ops.aten.cat.default([select_19, select_20, select_21, select_22, select_23], 2); select_19 = select_20 = select_21 = select_22 = select_23 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:785 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 0], | |
slice_113: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807) | |
slice_114: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_113, 1, 0, 9223372036854775807); slice_113 = None | |
select_24: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_114, 2, 0); slice_114 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:786 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -3], | |
slice_115: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807) | |
slice_116: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_115, 1, 0, 9223372036854775807); slice_115 = None | |
select_25: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_116, 2, -3); slice_116 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:787 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -2], | |
slice_117: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807) | |
slice_118: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_117, 1, 0, 9223372036854775807); slice_117 = None | |
select_26: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_118, 2, -2); slice_118 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:788 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -1], | |
slice_119: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807); view_6 = None | |
slice_120: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_119, 1, 0, 9223372036854775807); slice_119 = None | |
select_27: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_120, 2, -1); slice_120 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:789 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value[:, :, -1], | |
slice_121: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view_14, 0, 0, 9223372036854775807); view_14 = None | |
slice_122: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_121, 1, 0, 9223372036854775807); slice_121 = None | |
select_28: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = torch.ops.aten.select.int(slice_122, 2, -1); slice_122 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:783 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_value_mat = torch.cat( | |
cat_9: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" = torch.ops.aten.cat.default([select_24, select_25, select_26, select_27, select_28], 2); select_24 = select_25 = select_26 = select_27 = select_28 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:795 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -2], second_last_key_mat, ndim=4) | |
slice_123: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_4, 0, 0, 9223372036854775807) | |
slice_124: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_123, 1, 0, 9223372036854775807); slice_123 = None | |
select_29: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_124, 2, -2); slice_124 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2) | |
view_61: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(select_29, [12, 64, 64]); select_29 = None | |
view_62: "bf16[12, 448, 64][28672, 64, 1]cpu" = torch.ops.aten.view.default(cat_8, [-1, 448, 64]); cat_8 = None | |
permute_26: "bf16[12, 64, 448][28672, 1, 64]cpu" = torch.ops.aten.permute.default(view_62, [0, 2, 1]); view_62 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm( | |
bmm_12: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.ops.aten.bmm.default(view_61, permute_26); view_61 = permute_26 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2])) | |
view_63: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.view.default(bmm_12, [1, 12, 64, 448]); bmm_12 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:798 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, :to_block_size], | |
slice_125: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 0, 0, 9223372036854775807) | |
slice_126: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_125, 1, 0, 9223372036854775807); slice_125 = None | |
slice_127: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_126, 2, 0, 9223372036854775807); slice_126 = None | |
slice_128: "f32[1, 1, 1, 64][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_127, 3, 0, 64); slice_127 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:799 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, -3 * to_block_size :], | |
slice_129: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 0, 0, 9223372036854775807) | |
slice_130: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_129, 1, 0, 9223372036854775807); slice_129 = None | |
slice_131: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_130, 2, 0, 9223372036854775807); slice_130 = None | |
slice_132: "f32[1, 1, 1, 192][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_131, 3, -192, 9223372036854775807); slice_131 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:800 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask.new_ones([bsz, 1, 1, n_rand_blocks * to_block_size]), | |
full_2: "f32[1, 1, 1, 192][192, 192, 192, 1]cpu" = torch.ops.aten.full.default([1, 1, 1, 192], 1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:796 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_seq_pad = torch.cat( | |
cat_10: "f32[1, 1, 1, 448][448, 448, 448, 1]cpu" = torch.ops.aten.cat.default([slice_128, slice_132, full_2], 3); slice_128 = slice_132 = full_2 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:806 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask.new_ones([bsz, n_heads, from_block_size, 4 * to_block_size]), | |
full_3: "f32[1, 12, 64, 256][196608, 16384, 256, 1]cpu" = torch.ops.aten.full.default([1, 12, 64, 256], 1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:807 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask[:, :, -1], | |
slice_133: "f32[1, 12, 11, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.ops.aten.slice.Tensor(mul, 0, 0, 9223372036854775807); mul = None | |
slice_134: "f32[1, 12, 11, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.ops.aten.slice.Tensor(slice_133, 1, 0, 9223372036854775807); slice_133 = None | |
select_30: "f32[1, 12, 64, 192][1622016, 135168, 192, 1]cpu" = torch.ops.aten.select.int(slice_134, 2, -1); slice_134 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:804 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_rand_pad = torch.cat( | |
cat_11: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.cat.default([full_3, select_30], 3); full_3 = select_30 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:811 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product = second_last_product * rsqrt_d | |
mul_15: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.mul.Tensor(view_63, 0.125); view_63 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:812 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product += (1.0 - torch.minimum(second_last_seq_pad, second_last_rand_pad)) * attn_mask_penalty | |
minimum_1: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.minimum.default(cat_10, cat_11); cat_10 = cat_11 = None | |
sub_9: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, minimum_1); minimum_1 = None | |
mul_16: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.mul.Tensor(sub_9, -10000.0); sub_9 = None | |
add_11: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.add.Tensor(mul_15, mul_16); mul_15 = mul_16 = None | |
convert_element_type_39: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.prims.convert_element_type.default(add_11, torch.bfloat16); add_11 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:813 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_attn_weights = nn.functional.softmax( | |
convert_element_type_40: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.prims.convert_element_type.default(convert_element_type_39, torch.float32); convert_element_type_39 = None | |
amax_3: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.amax.default(convert_element_type_40, [-1], True) | |
sub_10: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.sub.Tensor(convert_element_type_40, amax_3); convert_element_type_40 = amax_3 = None | |
exp_3: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.exp.default(sub_10); sub_10 = None | |
sum_4: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp_3, [-1], True) | |
div_5: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.div.Tensor(exp_3, sum_4); exp_3 = sum_4 = None | |
convert_element_type_41: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.prims.convert_element_type.default(div_5, torch.bfloat16); div_5 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view( | |
view_64: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.ops.aten.view.default(convert_element_type_41, [-1, 64, 448]); convert_element_type_41 = None | |
view_65: "bf16[12, 448, 64][28672, 64, 1]cpu" = torch.ops.aten.view.default(cat_9, [-1, 448, 64]); cat_9 = None | |
bmm_13: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_64, view_65); view_64 = view_65 = None | |
view_66: "bf16[1, 12, 64, 64][49152, 4096, 64, 1]cpu" = torch.ops.aten.view.default(bmm_13, [1, 12, 64, 64]); bmm_13 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:819 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_context_layer.unsqueeze_(2) | |
unsqueeze_20: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(view_66, 2); view_66 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:826 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -1], key_layer, ndim=4) | |
slice_135: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_4, 0, 0, 9223372036854775807); view_4 = None | |
slice_136: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_135, 1, 0, 9223372036854775807); slice_135 = None | |
select_31: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_136, 2, -1); slice_136 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2) | |
view_67: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(select_31, [12, 64, 64]); select_31 = None | |
view_68: "bf16[12, 832, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(arg14_1, [12, 832, 64]); arg14_1 = None | |
permute_27: "bf16[12, 64, 832][64, 1, 768]cpu" = torch.ops.aten.permute.default(view_68, [0, 2, 1]); view_68 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm( | |
bmm_14: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.ops.aten.bmm.default(view_67, permute_27); view_67 = permute_27 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2])) | |
view_69: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.view.default(bmm_14, [1, 12, 64, 832]); bmm_14 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:827 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product = last_product * rsqrt_d | |
mul_17: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.mul.Tensor(view_69, 0.125); view_69 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:828 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product += (1.0 - to_mask) * attn_mask_penalty | |
sub_11: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, arg16_1); arg16_1 = None | |
mul_18: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.mul.Tensor(sub_11, -10000.0); sub_11 = None | |
add_12: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.add.Tensor(mul_17, mul_18); mul_17 = mul_18 = None | |
convert_element_type_46: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.prims.convert_element_type.default(add_12, torch.bfloat16); add_12 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:829 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_attn_weights = nn.functional.softmax(last_product, dim=-1) # [bsz, n_heads, from_block_size, n] | |
convert_element_type_47: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.prims.convert_element_type.default(convert_element_type_46, torch.float32); convert_element_type_46 = None | |
amax_4: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.amax.default(convert_element_type_47, [-1], True) | |
sub_12: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.sub.Tensor(convert_element_type_47, amax_4); convert_element_type_47 = amax_4 = None | |
exp_4: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.exp.default(sub_12); sub_12 = None | |
sum_5: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp_4, [-1], True) | |
div_6: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.div.Tensor(exp_4, sum_5); exp_4 = sum_5 = None | |
convert_element_type_48: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.prims.convert_element_type.default(div_6, torch.bfloat16); div_6 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view( | |
view_70: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.ops.aten.view.default(convert_element_type_48, [-1, 64, 832]); convert_element_type_48 = None | |
view_71: "bf16[12, 832, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(arg15_1, [12, 832, 64]); arg15_1 = None | |
bmm_15: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_70, view_71); view_70 = view_71 = None | |
view_72: "bf16[1, 12, 64, 64][49152, 4096, 64, 1]cpu" = torch.ops.aten.view.default(bmm_15, [1, 12, 64, 64]); bmm_15 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:833 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_context_layer.unsqueeze_(2) | |
unsqueeze_21: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(view_72, 2); view_72 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:836 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer = torch.cat( | |
cat_12: "bf16[1, 12, 13, 64, 64][638976, 53248, 4096, 64, 1]cpu" = torch.ops.aten.cat.default([unsqueeze_4, unsqueeze_5, view_60, unsqueeze_20, unsqueeze_21], 2); unsqueeze_4 = unsqueeze_5 = view_60 = unsqueeze_20 = unsqueeze_21 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:840 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer = context_layer.view((bsz, n_heads, from_seq_len, -1)) * from_mask | |
view_73: "bf16[1, 12, 832, 64][638976, 53248, 64, 1]cpu" = torch.ops.aten.view.default(cat_12, [1, 12, 832, -1]); cat_12 = None | |
mul_19: "f32[1, 12, 832, 64][638976, 53248, 64, 1]cpu" = torch.ops.aten.mul.Tensor(view_73, arg18_1); view_73 = arg18_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:841 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer = torch.transpose(context_layer, 1, 2) | |
permute_28: "f32[1, 832, 12, 64][638976, 64, 53248, 1]cpu" = torch.ops.aten.permute.default(mul_19, [0, 2, 1, 3]); mul_19 = None | |
return (permute_28, clone) | |
V0627 17:31:05.639000 139845268738432 torch/_inductor/compile_fx.py:754] {"inductor_post_grad_graph": {}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0, "has_payload": "ee4f5da4b7396f62d53589c7ddc358c5"} | |
class <lambda>(torch.nn.Module): | |
def forward(self, arg0_1: "i32[11, 3][3, 1]cpu", arg1_1: "i32[11, 3][3, 1]cpu", arg2_1: "i32[11, 3][3, 1]cpu", arg3_1: "i32[11, 3][3, 1]cpu", arg4_1: "i32[11, 3][3, 1]cpu", arg5_1: "i32[11, 3][3, 1]cpu", arg6_1: "i32[11, 3][3, 1]cpu", arg7_1: "i32[11, 3][3, 1]cpu", arg8_1: "i32[11, 3][3, 1]cpu", arg9_1: "i32[11, 3][3, 1]cpu", arg10_1: "i32[11, 3][3, 1]cpu", arg11_1: "i32[11, 3][3, 1]cpu", arg12_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", arg13_1: "f32[1, 13, 64][832, 64, 1]cpu", arg14_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", arg15_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", arg16_1: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu", arg17_1: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu", arg18_1: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu"): | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:602 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_query_matrix = query_layer.view(bsz, n_heads, from_seq_len // from_block_size, from_block_size, -1) | |
view_4: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.reshape.default(arg12_1, [1, 12, 13, 64, -1]); arg12_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:621 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, 0], key_layer, ndim=4) | |
select_2: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_4, 2, 0) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2) | |
view_15: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(select_2, [12, 64, 64]); select_2 = None | |
view_16: "bf16[12, 832, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(arg14_1, [12, 832, 64]) | |
permute_2: "bf16[12, 64, 832][64, 1, 768]cpu" = torch.ops.aten.permute.default(view_16, [0, 2, 1]); view_16 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm( | |
bmm: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.ops.aten.bmm.default(view_15, permute_2); view_15 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2])) | |
view_17: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.reshape.default(bmm, [1, 12, 64, 832]); bmm = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:623 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product = first_product * rsqrt_d | |
mul_3: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.mul.Tensor(view_17, 0.125); view_17 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:624 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product += (1.0 - to_mask) * attn_mask_penalty | |
sub: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, arg16_1) | |
mul_4: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.mul.Tensor(sub, -10000.0); sub = None | |
add_2: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.add.Tensor(mul_3, mul_4); mul_3 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:625 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_attn_weights = nn.functional.softmax( | |
amax: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.amax.default(add_2, [-1], True) | |
sub_1: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.sub.Tensor(add_2, amax); add_2 = amax = None | |
exp: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.exp.default(sub_1); sub_1 = None | |
sum_1: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp, [-1], True) | |
div_2: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.div.Tensor(exp, sum_1); exp = sum_1 = None | |
convert_element_type_5: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.prims.convert_element_type.default(div_2, torch.bfloat16); div_2 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view( | |
view_18: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.ops.aten.reshape.default(convert_element_type_5, [-1, 64, 832]); convert_element_type_5 = None | |
view_19: "bf16[12, 832, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(arg15_1, [12, 832, 64]) | |
bmm_1: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_18, view_19); view_18 = None | |
view_20: "bf16[1, 12, 64, 64][49152, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_1, [1, 12, 64, 64]); bmm_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:631 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_context_layer.unsqueeze_(2) | |
unsqueeze_4: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(view_20, 2); view_20 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:661 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, 1], second_key_mat, ndim=4) | |
select_13: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_4, 2, 1) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2) | |
view_21: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(select_13, [12, 64, 64]); select_13 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:603 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix = key_layer.view(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1) | |
view_5: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.reshape.default(arg14_1, [1, 12, 13, 64, -1]); arg14_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:641 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 0], | |
select_3: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_5, 2, 0) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:642 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 1], | |
select_4: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_5, 2, 1) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:643 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 2], | |
select_5: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_5, 2, 2) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:644 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -1], | |
select_6: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_5, 2, -1) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:976 in torch_gather_b2, code: flattened_params = params.reshape(-1, params.shape[-2], params.shape[-1]) | |
clone_2: "bf16[1, 12, 13, 64, 64][638976, 53248, 4096, 64, 1]cpu" = torch.ops.aten.clone.default(view_5, memory_format = torch.contiguous_format) | |
view_8: "bf16[156, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.reshape.default(clone_2, [156, 64, 64]); clone_2 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:593 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = np.stack(rand_attn, axis=0) | |
cat: "i32[132, 3][3, 1]cpu" = torch.ops.aten.cat.default([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1]); arg0_1 = arg1_1 = arg2_1 = arg3_1 = arg4_1 = arg5_1 = arg6_1 = arg7_1 = arg8_1 = arg9_1 = arg10_1 = arg11_1 = None | |
view: "i32[12, 11, 3][33, 3, 1]cpu" = torch.ops.aten.reshape.default(cat, [12, 11, 3]); cat = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:594 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = torch.tensor(rand_attn, device=query_layer.device, dtype=torch.long) | |
convert_element_type: "i64[12, 11, 3][33, 3, 1]cpu" = torch.ops.prims.convert_element_type.default(view, torch.int64); view = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:595 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn.unsqueeze_(0) | |
unsqueeze: "i64[1, 12, 11, 3][396, 33, 3, 1]cpu" = torch.ops.aten.unsqueeze.default(convert_element_type, 0); convert_element_type = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:975 in torch_gather_b2, code: flattened_indices = indices.view(-1) + indices_shift | |
view_7: "i64[396][1]cpu" = torch.ops.aten.reshape.default(unsqueeze, [-1]) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:972 in torch_gather_b2, code: shift = torch.arange(indices.shape[0] * indices.shape[1] * num_indices_to_gather, device=indices.device) | |
iota: "i64[396][1]cpu" = torch.ops.prims.iota.default(396, start = 0, step = 1, dtype = torch.int64, device = device(type='cpu'), requires_grad = False) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:973 in torch_gather_b2, code: indices_shift = torch.div(shift, num_indices_to_gather, rounding_mode="floor") * num_indices_to_pick_from | |
div: "i64[396][1]cpu" = torch.ops.aten.div.Tensor_mode(iota, 33, rounding_mode = 'floor'); iota = None | |
mul_1: "i64[396][1]cpu" = torch.ops.aten.mul.Tensor(div, 13); div = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:975 in torch_gather_b2, code: flattened_indices = indices.view(-1) + indices_shift | |
add: "i64[396][1]cpu" = torch.ops.aten.add.Tensor(view_7, mul_1); view_7 = mul_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:978 in torch_gather_b2, code: out_flattened = flattened_params.index_select(0, flattened_indices) | |
index_1: "bf16[396, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.index.Tensor(view_8, [add]); view_8 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:980 in torch_gather_b2, code: out = out_flattened.reshape(params.shape[:2] + (num_indices_to_gather,) + params.shape[3:]) | |
view_9: "bf16[1, 12, 33, 64, 64][1622016, 135168, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(index_1, [1, 12, 33, 64, 64]); index_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:608 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key = gathered_key.view( | |
view_10: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.reshape.default(view_9, [1, 12, 11, 192, -1]); view_9 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:645 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key[:, :, 0], | |
select_7: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = torch.ops.aten.select.int(view_10, 2, 0) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:639 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_key_mat = torch.cat( | |
cat_1: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" = torch.ops.aten.cat.default([select_3, select_4, select_5, select_6, select_7], 2); select_4 = select_5 = select_7 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2) | |
view_22: "bf16[12, 448, 64][28672, 64, 1]cpu" = torch.ops.aten.reshape.default(cat_1, [-1, 448, 64]); cat_1 = None | |
permute_3: "bf16[12, 64, 448][28672, 1, 64]cpu" = torch.ops.aten.permute.default(view_22, [0, 2, 1]); view_22 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm( | |
bmm_2: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.ops.aten.bmm.default(view_21, permute_3); view_21 = permute_3 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2])) | |
view_23: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.reshape.default(bmm_2, [1, 12, 64, 448]); bmm_2 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:677 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product = second_product * rsqrt_d | |
mul_5: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.mul.Tensor(view_23, 0.125); view_23 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:664 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, : 3 * to_block_size], | |
slice_30: "f32[1, 1, 1, 192][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 3, 0, 192) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:665 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, -to_block_size:], | |
slice_34: "f32[1, 1, 1, 64][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 3, -64, 9223372036854775807) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:666 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask.new_ones([bsz, 1, 1, n_rand_blocks * to_block_size]), | |
full_default: "f32[1, 1, 1, 192][192, 192, 192, 1]cpu" = torch.ops.aten.full.default([1, 1, 1, 192], 1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:662 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_seq_pad = torch.cat( | |
cat_3: "f32[1, 1, 1, 448][448, 448, 448, 1]cpu" = torch.ops.aten.cat.default([slice_30, slice_34, full_default], 3); slice_30 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:672 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask.new_ones([bsz, n_heads, from_block_size, 4 * to_block_size]), | |
full_default_1: "f32[1, 12, 64, 256][196608, 16384, 256, 1]cpu" = torch.ops.aten.full.default([1, 12, 64, 256], 1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1017 in _create_rand_mask_from_inputs, code: rand_mask = torch.einsum("blq,bhlk->bhlqk", from_blocked_mask[:, 1:-1], rand_mask) | |
slice_2: "f32[1, 11, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(arg13_1, 1, 1, -1) | |
unsqueeze_1: "f32[1, 11, 64, 1][832, 64, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_2, 3); slice_2 = None | |
unsqueeze_2: "f32[1, 11, 64, 1, 1][832, 64, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_1, 4); unsqueeze_1 = None | |
permute: "f32[1, 1, 11, 64, 1][832, 1, 64, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_2, [0, 3, 1, 2, 4]); unsqueeze_2 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in _create_rand_mask_from_inputs, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)]) | |
select: "f32[13, 64][64, 1]cpu" = torch.ops.aten.select.int(arg13_1, 0, 0); arg13_1 = None | |
select_1: "i64[12, 11, 3][33, 3, 1]cpu" = torch.ops.aten.select.int(unsqueeze, 0, 0) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in <listcomp>, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)]) | |
view_1: "i64[396][1]cpu" = torch.ops.aten.reshape.default(select_1, [396]); select_1 = None | |
index: "f32[396, 64][64, 1]cpu" = torch.ops.aten.index.Tensor(select, [view_1]); select = view_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in _create_rand_mask_from_inputs, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)]) | |
view_2: "f32[1, 396, 64][25344, 64, 1]cpu" = torch.ops.aten.reshape.default(index, [1, 396, 64]); index = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1016 in _create_rand_mask_from_inputs, code: rand_mask = rand_mask.view(batch_size, num_attention_heads, num_windows, num_rand_blocks * from_block_size) | |
view_3: "f32[1, 12, 11, 192][25344, 2112, 192, 1]cpu" = torch.ops.aten.reshape.default(view_2, [1, 12, 11, 192]); view_2 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1017 in _create_rand_mask_from_inputs, code: rand_mask = torch.einsum("blq,bhlk->bhlqk", from_blocked_mask[:, 1:-1], rand_mask) | |
unsqueeze_3: "f32[1, 12, 11, 192, 1][25344, 2112, 192, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(view_3, 4); view_3 = None | |
permute_1: "f32[1, 12, 11, 1, 192][25344, 2112, 192, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_3, [0, 1, 2, 4, 3]); unsqueeze_3 = None | |
mul: "f32[1, 12, 11, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(permute, permute_1); permute = permute_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:673 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask[:, :, 0], | |
select_14: "f32[1, 12, 64, 192][1622016, 135168, 192, 1]cpu" = torch.ops.aten.select.int(mul, 2, 0) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:670 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_rand_pad = torch.cat( | |
cat_4: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.cat.default([full_default_1, select_14], 3); select_14 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:678 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product += (1.0 - torch.minimum(second_seq_pad, second_rand_pad)) * attn_mask_penalty | |
minimum: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.minimum.default(cat_3, cat_4); cat_3 = cat_4 = None | |
sub_2: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, minimum); minimum = None | |
mul_6: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.mul.Tensor(sub_2, -10000.0); sub_2 = None | |
add_3: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.add.Tensor(mul_5, mul_6); mul_5 = mul_6 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:679 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_attn_weights = nn.functional.softmax( | |
amax_1: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.amax.default(add_3, [-1], True) | |
sub_3: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.sub.Tensor(add_3, amax_1); add_3 = amax_1 = None | |
exp_1: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.exp.default(sub_3); sub_3 = None | |
sum_2: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp_1, [-1], True) | |
div_3: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.div.Tensor(exp_1, sum_2); exp_1 = sum_2 = None | |
convert_element_type_12: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.prims.convert_element_type.default(div_3, torch.bfloat16); div_3 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view( | |
view_24: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.ops.aten.reshape.default(convert_element_type_12, [-1, 64, 448]); convert_element_type_12 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:604 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix = value_layer.view(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1) | |
view_6: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.reshape.default(arg15_1, [1, 12, 13, 64, -1]); arg15_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:651 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 0], | |
select_8: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_6, 2, 0) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:652 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 1], | |
select_9: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_6, 2, 1) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:653 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 2], | |
select_10: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_6, 2, 2) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:654 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -1], | |
select_11: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_6, 2, -1) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:976 in torch_gather_b2, code: flattened_params = params.reshape(-1, params.shape[-2], params.shape[-1]) | |
clone_3: "bf16[1, 12, 13, 64, 64][638976, 53248, 4096, 64, 1]cpu" = torch.ops.aten.clone.default(view_6, memory_format = torch.contiguous_format) | |
view_12: "bf16[156, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.reshape.default(clone_3, [156, 64, 64]); clone_3 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:978 in torch_gather_b2, code: out_flattened = flattened_params.index_select(0, flattened_indices) | |
index_2: "bf16[396, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.index.Tensor(view_12, [add]); view_12 = add = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:980 in torch_gather_b2, code: out = out_flattened.reshape(params.shape[:2] + (num_indices_to_gather,) + params.shape[3:]) | |
view_13: "bf16[1, 12, 33, 64, 64][1622016, 135168, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(index_2, [1, 12, 33, 64, 64]); index_2 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:612 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value = gathered_value.view( | |
view_14: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.reshape.default(view_13, [1, 12, 11, 192, -1]); view_13 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:655 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value[:, :, 0], | |
select_12: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = torch.ops.aten.select.int(view_14, 2, 0) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:649 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_value_mat = torch.cat( | |
cat_2: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" = torch.ops.aten.cat.default([select_8, select_9, select_10, select_11, select_12], 2); select_9 = select_10 = select_12 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view( | |
view_25: "bf16[12, 448, 64][28672, 64, 1]cpu" = torch.ops.aten.reshape.default(cat_2, [-1, 448, 64]); cat_2 = None | |
bmm_3: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_24, view_25); view_24 = view_25 = None | |
view_26: "bf16[1, 12, 64, 64][49152, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_3, [1, 12, 64, 64]); bmm_3 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:686 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_context_layer.unsqueeze_(2) | |
unsqueeze_5: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(view_26, 2); view_26 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:702 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: middle_query_matrix = blocked_query_matrix[:, :, 2:-2] | |
slice_57: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_4, 2, 2, -2) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:717 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product = torch.einsum( | |
unsqueeze_6: "bf16[1, 12, 9, 64, 64, 1][638976, 64, 49152, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_57, 5) | |
permute_6: "bf16[1, 12, 9, 64, 1, 64][638976, 64, 49152, 768, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_6, [0, 1, 2, 3, 5, 4]); unsqueeze_6 = None | |
permute_8: "bf16[12, 9, 64, 64, 1, 1][64, 49152, 768, 1, 638976, 1]cpu" = torch.ops.aten.permute.default(permute_6, [1, 2, 3, 5, 0, 4]); permute_6 = None | |
view_33: "bf16[12, 576, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(permute_8, [12, 576, 64]); permute_8 = None | |
unsqueeze_7: "bf16[1, 12, 64, 64, 1][638976, 64, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(select_3, 4) | |
unsqueeze_8: "bf16[1, 12, 64, 64, 1, 1][638976, 64, 768, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_7, 5); unsqueeze_7 = None | |
permute_7: "bf16[1, 12, 1, 1, 64, 64][638976, 64, 1, 1, 768, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_8, [0, 1, 4, 5, 2, 3]); unsqueeze_8 = None | |
permute_9: "bf16[12, 64, 1, 64, 1, 1][64, 1, 638976, 768, 1, 1]cpu" = torch.ops.aten.permute.default(permute_7, [1, 5, 0, 4, 2, 3]); permute_7 = None | |
view_34: "bf16[12, 64, 64][64, 1, 768]cpu" = torch.ops.aten.reshape.default(permute_9, [12, 64, 64]); permute_9 = None | |
bmm_6: "bf16[12, 576, 64][36864, 64, 1]cpu" = torch.ops.aten.bmm.default(view_33, view_34); view_34 = None | |
view_35: "bf16[12, 9, 64, 1, 1, 64][36864, 4096, 64, 64, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_6, [12, 9, 64, 1, 1, 64]); bmm_6 = None | |
permute_10: "bf16[1, 12, 9, 64, 64, 1][64, 36864, 4096, 64, 1, 64]cpu" = torch.ops.aten.permute.default(view_35, [4, 0, 1, 2, 5, 3]); view_35 = None | |
view_36: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(permute_10, [1, 12, 9, 64, 64]); permute_10 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:720 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product = first_band_product * rsqrt_d | |
mul_9: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.mul.Tensor(view_36, 0.125); view_36 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:730 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product += (1.0 - to_mask[:, :, :, :to_block_size].unsqueeze(3)) * attn_mask_penalty | |
slice_68: "f32[1, 1, 1, 64][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 3, 0, 64) | |
unsqueeze_12: "f32[1, 1, 1, 1, 64][832, 832, 832, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_68, 3) | |
sub_5: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, unsqueeze_12); unsqueeze_12 = None | |
mul_12: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = torch.ops.aten.mul.Tensor(sub_5, -10000.0); sub_5 = None | |
add_5: "f32[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(mul_9, mul_12); mul_9 = mul_12 = None | |
convert_element_type_24: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.prims.convert_element_type.default(add_5, torch.bfloat16); add_5 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2) | |
clone_4: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.clone.default(slice_57, memory_format = torch.contiguous_format); slice_57 = None | |
view_27: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.reshape.default(clone_4, [108, 64, 64]); clone_4 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:696 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: [blocked_key_matrix[:, :, 1:-3], blocked_key_matrix[:, :, 2:-2], blocked_key_matrix[:, :, 3:-1]], dim=3 | |
slice_39: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 2, 1, -3) | |
slice_42: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 2, 2, -2) | |
slice_45: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 2, 3, -1) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:695 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: exp_blocked_key_matrix = torch.cat( | |
cat_5: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" = torch.ops.aten.cat.default([slice_39, slice_42, slice_45], 3); slice_39 = slice_42 = slice_45 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2) | |
view_28: "bf16[108, 192, 64][12288, 64, 1]cpu" = torch.ops.aten.reshape.default(cat_5, [-1, 192, 64]); cat_5 = None | |
permute_4: "bf16[108, 64, 192][12288, 1, 64]cpu" = torch.ops.aten.permute.default(view_28, [0, 2, 1]); view_28 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm( | |
bmm_4: "bf16[108, 64, 192][12288, 192, 1]cpu" = torch.ops.aten.bmm.default(view_27, permute_4); permute_4 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2])) | |
view_29: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.reshape.default(bmm_4, [1, 12, 9, 64, 192]); bmm_4 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:708 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: inner_band_product = inner_band_product * rsqrt_d | |
mul_7: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(view_29, 0.125); view_29 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:729 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: inner_band_product += (1.0 - band_mask) * attn_mask_penalty | |
sub_4: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, arg17_1); arg17_1 = None | |
mul_11: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(sub_4, -10000.0); sub_4 = None | |
add_4: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.add.Tensor(mul_7, mul_11); mul_7 = mul_11 = None | |
convert_element_type_23: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.prims.convert_element_type.default(add_4, torch.bfloat16); add_4 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:712 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product = self.torch_bmm_nd_transpose(middle_query_matrix, gathered_key[:, :, 1:-1], ndim=5) | |
slice_60: "bf16[1, 12, 9, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view_10, 2, 1, -1) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2) | |
clone_6: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" = torch.ops.aten.clone.default(slice_60, memory_format = torch.contiguous_format); slice_60 = None | |
view_31: "bf16[108, 192, 64][12288, 64, 1]cpu" = torch.ops.aten.reshape.default(clone_6, [108, 192, 64]); clone_6 = None | |
permute_5: "bf16[108, 64, 192][12288, 1, 64]cpu" = torch.ops.aten.permute.default(view_31, [0, 2, 1]); view_31 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm( | |
bmm_5: "bf16[108, 64, 192][12288, 192, 1]cpu" = torch.ops.aten.bmm.default(view_27, permute_5); view_27 = permute_5 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2])) | |
view_32: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.reshape.default(bmm_5, [1, 12, 9, 64, 192]); bmm_5 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:714 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product = rand_band_product * rsqrt_d | |
mul_8: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(view_32, 0.125); view_32 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:732 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product += (1.0 - rand_mask[:, :, 1:-1]) * attn_mask_penalty | |
slice_75: "f32[1, 12, 9, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.ops.aten.slice.Tensor(mul, 2, 1, -1) | |
sub_7: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, slice_75); slice_75 = None | |
mul_14: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(sub_7, -10000.0); sub_7 = None | |
add_7: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.add.Tensor(mul_8, mul_14); mul_8 = mul_14 = None | |
convert_element_type_26: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.prims.convert_element_type.default(add_7, torch.bfloat16); add_7 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:723 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product = torch.einsum( | |
unsqueeze_10: "bf16[1, 12, 64, 64, 1][638976, 64, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(select_6, 4) | |
unsqueeze_11: "bf16[1, 12, 64, 64, 1, 1][638976, 64, 768, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_10, 5); unsqueeze_10 = None | |
permute_12: "bf16[1, 12, 1, 1, 64, 64][638976, 64, 1, 1, 768, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_11, [0, 1, 4, 5, 2, 3]); unsqueeze_11 = None | |
permute_14: "bf16[12, 64, 1, 64, 1, 1][64, 1, 638976, 768, 1, 1]cpu" = torch.ops.aten.permute.default(permute_12, [1, 5, 0, 4, 2, 3]); permute_12 = None | |
view_38: "bf16[12, 64, 64][64, 1, 768]cpu" = torch.ops.aten.reshape.default(permute_14, [12, 64, 64]); permute_14 = None | |
bmm_7: "bf16[12, 576, 64][36864, 64, 1]cpu" = torch.ops.aten.bmm.default(view_33, view_38); view_33 = view_38 = None | |
view_39: "bf16[12, 9, 64, 1, 1, 64][36864, 4096, 64, 64, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_7, [12, 9, 64, 1, 1, 64]); bmm_7 = None | |
permute_15: "bf16[1, 12, 9, 64, 64, 1][64, 36864, 4096, 64, 1, 64]cpu" = torch.ops.aten.permute.default(view_39, [4, 0, 1, 2, 5, 3]); view_39 = None | |
view_40: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(permute_15, [1, 12, 9, 64, 64]); permute_15 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:726 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product = last_band_product * rsqrt_d | |
mul_10: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.mul.Tensor(view_40, 0.125); view_40 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:731 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product += (1.0 - to_mask[:, :, :, -to_block_size:].unsqueeze(3)) * attn_mask_penalty | |
unsqueeze_13: "f32[1, 1, 1, 1, 64][832, 832, 832, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_34, 3); slice_34 = None | |
sub_6: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, unsqueeze_13); unsqueeze_13 = None | |
mul_13: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = torch.ops.aten.mul.Tensor(sub_6, -10000.0); sub_6 = None | |
add_6: "f32[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(mul_10, mul_13); mul_10 = mul_13 = None | |
convert_element_type_25: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.prims.convert_element_type.default(add_6, torch.bfloat16); add_6 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:735 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: band_product = torch.cat( | |
cat_7: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.cat.default([convert_element_type_24, convert_element_type_23, convert_element_type_26, convert_element_type_25], -1); convert_element_type_24 = convert_element_type_23 = convert_element_type_26 = convert_element_type_25 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:740 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights = nn.functional.softmax( | |
convert_element_type_27: "f32[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.prims.convert_element_type.default(cat_7, torch.float32); cat_7 = None | |
amax_2: "f32[1, 12, 9, 64, 1][6912, 576, 64, 1, 1]cpu" = torch.ops.aten.amax.default(convert_element_type_27, [-1], True) | |
sub_8: "f32[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.sub.Tensor(convert_element_type_27, amax_2); convert_element_type_27 = amax_2 = None | |
exp_2: "f32[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.exp.default(sub_8); sub_8 = None | |
sum_3: "f32[1, 12, 9, 64, 1][6912, 576, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp_2, [-1], True) | |
div_4: "f32[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.div.Tensor(exp_2, sum_3); exp_2 = sum_3 = None | |
convert_element_type_28: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.prims.convert_element_type.default(div_4, torch.bfloat16); div_4 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:747 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights[:, :, :, :, to_block_size : 4 * to_block_size], exp_blocked_value_matrix, ndim=5 | |
slice_80: "bf16[1, 12, 9, 64, 192][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(convert_element_type_28, 4, 64, 256) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view( | |
view_41: "bf16[108, 64, 192][32768, 512, 1]cpu" = torch.ops.aten.reshape.default(slice_80, [108, 64, 192]); slice_80 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:699 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: [blocked_value_matrix[:, :, 1:-3], blocked_value_matrix[:, :, 2:-2], blocked_value_matrix[:, :, 3:-1]], | |
slice_48: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 2, 1, -3) | |
slice_51: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 2, 2, -2) | |
slice_54: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 2, 3, -1) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:698 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: exp_blocked_value_matrix = torch.cat( | |
cat_6: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" = torch.ops.aten.cat.default([slice_48, slice_51, slice_54], 3); slice_48 = slice_51 = slice_54 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view( | |
view_42: "bf16[108, 192, 64][12288, 64, 1]cpu" = torch.ops.aten.reshape.default(cat_6, [-1, 192, 64]); cat_6 = None | |
bmm_8: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_41, view_42); view_41 = view_42 = None | |
view_43: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_8, [1, 12, 9, 64, 64]); bmm_8 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:754 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights[:, :, :, :, 4 * to_block_size : -to_block_size], gathered_value[:, :, 1:-1], ndim=5 | |
slice_85: "bf16[1, 12, 9, 64, 192][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(convert_element_type_28, 4, 256, -64) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view( | |
view_44: "bf16[108, 64, 192][32768, 512, 1]cpu" = torch.ops.aten.reshape.default(slice_85, [108, 64, 192]); slice_85 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:754 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights[:, :, :, :, 4 * to_block_size : -to_block_size], gathered_value[:, :, 1:-1], ndim=5 | |
slice_88: "bf16[1, 12, 9, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view_14, 2, 1, -1) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view( | |
clone_7: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" = torch.ops.aten.clone.default(slice_88, memory_format = torch.contiguous_format); slice_88 = None | |
view_45: "bf16[108, 192, 64][12288, 64, 1]cpu" = torch.ops.aten.reshape.default(clone_7, [108, 192, 64]); clone_7 = None | |
bmm_9: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_44, view_45); view_44 = view_45 = None | |
view_46: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_9, [1, 12, 9, 64, 64]); bmm_9 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:753 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += self.torch_bmm_nd( | |
add_8: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(view_43, view_46); view_43 = view_46 = None | |
view_47: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.reshape.default(add_8, [108, 64, 64]); add_8 = None | |
view_48: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(view_47, [1, 12, 9, 64, 64]); view_47 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:760 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, :to_block_size], blocked_value_matrix[:, :, 0] | |
slice_93: "bf16[1, 12, 9, 64, 64][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(convert_element_type_28, 4, 0, 64) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:759 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += torch.einsum( | |
unsqueeze_14: "bf16[1, 12, 9, 64, 64, 1][3538944, 294912, 32768, 512, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_93, 5); slice_93 = None | |
permute_16: "bf16[1, 12, 9, 64, 1, 64][3538944, 294912, 32768, 512, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_14, [0, 1, 2, 3, 5, 4]); unsqueeze_14 = None | |
permute_18: "bf16[12, 9, 64, 64, 1, 1][294912, 32768, 512, 1, 3538944, 1]cpu" = torch.ops.aten.permute.default(permute_16, [1, 2, 3, 5, 0, 4]); permute_16 = None | |
view_49: "bf16[12, 576, 64][294912, 512, 1]cpu" = torch.ops.aten.reshape.default(permute_18, [12, 576, 64]); permute_18 = None | |
unsqueeze_15: "bf16[1, 12, 64, 64, 1][638976, 64, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(select_8, 4) | |
unsqueeze_16: "bf16[1, 12, 64, 64, 1, 1][638976, 64, 768, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_15, 5); unsqueeze_15 = None | |
permute_17: "bf16[1, 12, 1, 1, 64, 64][638976, 64, 1, 1, 1, 768]cpu" = torch.ops.aten.permute.default(unsqueeze_16, [0, 1, 4, 5, 3, 2]); unsqueeze_16 = None | |
permute_19: "bf16[12, 64, 1, 64, 1, 1][64, 768, 638976, 1, 1, 1]cpu" = torch.ops.aten.permute.default(permute_17, [1, 5, 0, 4, 2, 3]); permute_17 = None | |
view_50: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(permute_19, [12, 64, 64]); permute_19 = None | |
bmm_10: "bf16[12, 576, 64][36864, 64, 1]cpu" = torch.ops.aten.bmm.default(view_49, view_50); view_49 = view_50 = None | |
view_51: "bf16[12, 9, 64, 1, 1, 64][36864, 4096, 64, 64, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_10, [12, 9, 64, 1, 1, 64]); bmm_10 = None | |
permute_20: "bf16[1, 12, 9, 64, 64, 1][64, 36864, 4096, 64, 1, 64]cpu" = torch.ops.aten.permute.default(view_51, [4, 0, 1, 2, 5, 3]); view_51 = None | |
view_52: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(permute_20, [1, 12, 9, 64, 64]); permute_20 = None | |
add_9: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(view_48, view_52); view_48 = view_52 = None | |
view_53: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.reshape.default(add_9, [108, 64, 64]); add_9 = None | |
view_54: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(view_53, [1, 12, 9, 64, 64]); view_53 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:763 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, -to_block_size:], blocked_value_matrix[:, :, -1] | |
slice_100: "bf16[1, 12, 9, 64, 64][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(convert_element_type_28, 4, -64, 9223372036854775807); convert_element_type_28 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:762 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += torch.einsum( | |
unsqueeze_17: "bf16[1, 12, 9, 64, 64, 1][3538944, 294912, 32768, 512, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_100, 5); slice_100 = None | |
permute_21: "bf16[1, 12, 9, 64, 1, 64][3538944, 294912, 32768, 512, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_17, [0, 1, 2, 3, 5, 4]); unsqueeze_17 = None | |
permute_23: "bf16[12, 9, 64, 64, 1, 1][294912, 32768, 512, 1, 3538944, 1]cpu" = torch.ops.aten.permute.default(permute_21, [1, 2, 3, 5, 0, 4]); permute_21 = None | |
view_55: "bf16[12, 576, 64][294912, 512, 1]cpu" = torch.ops.aten.reshape.default(permute_23, [12, 576, 64]); permute_23 = None | |
unsqueeze_18: "bf16[1, 12, 64, 64, 1][638976, 64, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(select_11, 4) | |
unsqueeze_19: "bf16[1, 12, 64, 64, 1, 1][638976, 64, 768, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_18, 5); unsqueeze_18 = None | |
permute_22: "bf16[1, 12, 1, 1, 64, 64][638976, 64, 1, 1, 1, 768]cpu" = torch.ops.aten.permute.default(unsqueeze_19, [0, 1, 4, 5, 3, 2]); unsqueeze_19 = None | |
permute_24: "bf16[12, 64, 1, 64, 1, 1][64, 768, 638976, 1, 1, 1]cpu" = torch.ops.aten.permute.default(permute_22, [1, 5, 0, 4, 2, 3]); permute_22 = None | |
view_56: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(permute_24, [12, 64, 64]); permute_24 = None | |
bmm_11: "bf16[12, 576, 64][36864, 64, 1]cpu" = torch.ops.aten.bmm.default(view_55, view_56); view_55 = view_56 = None | |
view_57: "bf16[12, 9, 64, 1, 1, 64][36864, 4096, 64, 64, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_11, [12, 9, 64, 1, 1, 64]); bmm_11 = None | |
permute_25: "bf16[1, 12, 9, 64, 64, 1][64, 36864, 4096, 64, 1, 64]cpu" = torch.ops.aten.permute.default(view_57, [4, 0, 1, 2, 5, 3]); view_57 = None | |
view_58: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(permute_25, [1, 12, 9, 64, 64]); permute_25 = None | |
add_10: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(view_54, view_58); view_54 = view_58 = None | |
view_59: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.reshape.default(add_10, [108, 64, 64]); add_10 = None | |
view_60: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(view_59, [1, 12, 9, 64, 64]); view_59 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:795 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -2], second_last_key_mat, ndim=4) | |
select_29: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_4, 2, -2) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2) | |
view_61: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(select_29, [12, 64, 64]); select_29 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:776 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -3], | |
select_20: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_5, 2, -3) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:777 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -2], | |
select_21: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_5, 2, -2); view_5 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:779 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key[:, :, -1], | |
select_23: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = torch.ops.aten.select.int(view_10, 2, -1); view_10 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:773 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_key_mat = torch.cat( | |
cat_8: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" = torch.ops.aten.cat.default([select_3, select_20, select_21, select_6, select_23], 2); select_3 = select_20 = select_21 = select_6 = select_23 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2) | |
view_62: "bf16[12, 448, 64][28672, 64, 1]cpu" = torch.ops.aten.reshape.default(cat_8, [-1, 448, 64]); cat_8 = None | |
permute_26: "bf16[12, 64, 448][28672, 1, 64]cpu" = torch.ops.aten.permute.default(view_62, [0, 2, 1]); view_62 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm( | |
bmm_12: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.ops.aten.bmm.default(view_61, permute_26); view_61 = permute_26 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2])) | |
view_63: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.reshape.default(bmm_12, [1, 12, 64, 448]); bmm_12 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:811 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product = second_last_product * rsqrt_d | |
mul_15: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.mul.Tensor(view_63, 0.125); view_63 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:799 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, -3 * to_block_size :], | |
slice_132: "f32[1, 1, 1, 192][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 3, -192, 9223372036854775807); arg16_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:796 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_seq_pad = torch.cat( | |
cat_10: "f32[1, 1, 1, 448][448, 448, 448, 1]cpu" = torch.ops.aten.cat.default([slice_68, slice_132, full_default], 3); slice_68 = slice_132 = full_default = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:807 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask[:, :, -1], | |
select_30: "f32[1, 12, 64, 192][1622016, 135168, 192, 1]cpu" = torch.ops.aten.select.int(mul, 2, -1); mul = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:804 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_rand_pad = torch.cat( | |
cat_11: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.cat.default([full_default_1, select_30], 3); full_default_1 = select_30 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:812 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product += (1.0 - torch.minimum(second_last_seq_pad, second_last_rand_pad)) * attn_mask_penalty | |
minimum_1: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.minimum.default(cat_10, cat_11); cat_10 = cat_11 = None | |
sub_9: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, minimum_1); minimum_1 = None | |
mul_16: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.mul.Tensor(sub_9, -10000.0); sub_9 = None | |
add_11: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.add.Tensor(mul_15, mul_16); mul_15 = mul_16 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:813 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_attn_weights = nn.functional.softmax( | |
amax_3: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.amax.default(add_11, [-1], True) | |
sub_10: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.sub.Tensor(add_11, amax_3); add_11 = amax_3 = None | |
exp_3: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.exp.default(sub_10); sub_10 = None | |
sum_4: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp_3, [-1], True) | |
div_5: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.div.Tensor(exp_3, sum_4); exp_3 = sum_4 = None | |
convert_element_type_41: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.prims.convert_element_type.default(div_5, torch.bfloat16); div_5 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view( | |
view_64: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.ops.aten.reshape.default(convert_element_type_41, [-1, 64, 448]); convert_element_type_41 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:786 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -3], | |
select_25: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_6, 2, -3) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:787 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -2], | |
select_26: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_6, 2, -2); view_6 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:789 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value[:, :, -1], | |
select_28: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = torch.ops.aten.select.int(view_14, 2, -1); view_14 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:783 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_value_mat = torch.cat( | |
cat_9: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" = torch.ops.aten.cat.default([select_8, select_25, select_26, select_11, select_28], 2); select_8 = select_25 = select_26 = select_11 = select_28 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view( | |
view_65: "bf16[12, 448, 64][28672, 64, 1]cpu" = torch.ops.aten.reshape.default(cat_9, [-1, 448, 64]); cat_9 = None | |
bmm_13: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_64, view_65); view_64 = view_65 = None | |
view_66: "bf16[1, 12, 64, 64][49152, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_13, [1, 12, 64, 64]); bmm_13 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:819 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_context_layer.unsqueeze_(2) | |
unsqueeze_20: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(view_66, 2); view_66 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:826 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -1], key_layer, ndim=4) | |
select_31: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_4, 2, -1); view_4 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2) | |
view_67: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(select_31, [12, 64, 64]); select_31 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm( | |
bmm_14: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.ops.aten.bmm.default(view_67, permute_2); view_67 = permute_2 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2])) | |
view_69: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.reshape.default(bmm_14, [1, 12, 64, 832]); bmm_14 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:827 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product = last_product * rsqrt_d | |
mul_17: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.mul.Tensor(view_69, 0.125); view_69 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:828 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product += (1.0 - to_mask) * attn_mask_penalty | |
add_12: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.add.Tensor(mul_17, mul_4); mul_17 = mul_4 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:829 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_attn_weights = nn.functional.softmax(last_product, dim=-1) # [bsz, n_heads, from_block_size, n] | |
amax_4: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.amax.default(add_12, [-1], True) | |
sub_12: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.sub.Tensor(add_12, amax_4); add_12 = amax_4 = None | |
exp_4: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.exp.default(sub_12); sub_12 = None | |
sum_5: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp_4, [-1], True) | |
div_6: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.div.Tensor(exp_4, sum_5); exp_4 = sum_5 = None | |
convert_element_type_48: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.prims.convert_element_type.default(div_6, torch.bfloat16); div_6 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view( | |
view_70: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.ops.aten.reshape.default(convert_element_type_48, [-1, 64, 832]); convert_element_type_48 = None | |
bmm_15: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_70, view_19); view_70 = view_19 = None | |
view_72: "bf16[1, 12, 64, 64][49152, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_15, [1, 12, 64, 64]); bmm_15 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:833 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_context_layer.unsqueeze_(2) | |
unsqueeze_21: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(view_72, 2); view_72 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:836 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer = torch.cat( | |
cat_12: "bf16[1, 12, 13, 64, 64][638976, 53248, 4096, 64, 1]cpu" = torch.ops.aten.cat.default([unsqueeze_4, unsqueeze_5, view_60, unsqueeze_20, unsqueeze_21], 2); unsqueeze_4 = unsqueeze_5 = view_60 = unsqueeze_20 = unsqueeze_21 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:840 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer = context_layer.view((bsz, n_heads, from_seq_len, -1)) * from_mask | |
view_73: "bf16[1, 12, 832, 64][638976, 53248, 64, 1]cpu" = torch.ops.aten.reshape.default(cat_12, [1, 12, 832, -1]); cat_12 = None | |
mul_19: "f32[1, 12, 832, 64][638976, 53248, 64, 1]cpu" = torch.ops.aten.mul.Tensor(view_73, arg18_1); view_73 = arg18_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:841 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer = torch.transpose(context_layer, 1, 2) | |
permute_28: "f32[1, 832, 12, 64][638976, 64, 53248, 1]cpu" = torch.ops.aten.permute.default(mul_19, [0, 2, 1, 3]); mul_19 = None | |
return (permute_28, unsqueeze) | |
V0627 17:31:09.541000 139845268738432 torch/_inductor/graph.py:1693] {"inductor_output_code": {"filename": "/tmp/torchinductor_leslie/6i/c6icdm2jkh5xkxrgpyz2vtbd5oehca45dznneh7n63f3sirkkptn.py"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0, "has_payload": "47217ba55691917867319806954aafb8"} | |
# AOT ID: ['8_inference'] | |
from ctypes import c_void_p, c_long | |
import torch | |
import math | |
import random | |
import os | |
import tempfile | |
from math import inf, nan | |
from torch._inductor.hooks import run_intermediate_hooks | |
from torch._inductor.utils import maybe_profile | |
from torch._inductor.codegen.memory_planning import _align as align | |
from torch import device, empty_strided | |
from torch._inductor.async_compile import AsyncCompile | |
from torch._inductor.select_algorithm import extern_kernels | |
from torch._inductor.codegen.multi_kernel import MultiKernelCall | |
aten = torch.ops.aten | |
inductor_ops = torch.ops.inductor | |
_quantized = torch.ops._quantized | |
assert_size_stride = torch._C._dynamo.guards.assert_size_stride | |
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu | |
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda | |
alloc_from_pool = torch.ops.inductor._alloc_from_pool | |
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor | |
async_compile = AsyncCompile() | |
cpp_fused__softmax_add_mul_rsub_0 = async_compile.cpp_pybinding(['const bfloat16*', 'const float*', 'float*', 'float*', 'float*', 'bfloat16*'], ''' | |
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h" | |
extern "C" void kernel(const bfloat16* in_ptr0, | |
const float* in_ptr1, | |
float* out_ptr0, | |
float* out_ptr1, | |
float* out_ptr2, | |
bfloat16* out_ptr3) | |
{ | |
#pragma omp parallel num_threads(56) | |
{ | |
int tid = omp_get_thread_num(); | |
{ | |
#pragma omp for | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(768L); x0+=static_cast<long>(1L)) | |
{ | |
{ | |
float tmp_acc0 = -std::numeric_limits<float>::infinity(); | |
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(-std::numeric_limits<float>::infinity()); | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(832L); x1+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x1 + (832L*x0)), 16); | |
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(x1), 16); | |
auto tmp1 = at::vec::convert<float>(tmp0); | |
auto tmp2 = static_cast<float>(0.125); | |
auto tmp3 = at::vec::Vectorized<float>(tmp2); | |
auto tmp4 = tmp1 * tmp3; | |
auto tmp5 = (tmp4); | |
auto tmp7 = static_cast<float>(1.0); | |
auto tmp8 = at::vec::Vectorized<float>(tmp7); | |
auto tmp9 = tmp8 - tmp6; | |
auto tmp10 = static_cast<float>(-10000.0); | |
auto tmp11 = at::vec::Vectorized<float>(tmp10); | |
auto tmp12 = tmp9 * tmp11; | |
auto tmp13 = tmp5 + tmp12; | |
tmp_acc0_vec = at::vec::maximum(tmp_acc0_vec, tmp13); | |
} | |
tmp_acc0 = max_propagate_nan(tmp_acc0, at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return at::vec::maximum(x, y); }, tmp_acc0_vec)); | |
out_ptr0[static_cast<long>(x0)] = static_cast<float>(tmp_acc0); | |
} | |
{ | |
float tmp_acc0 = 0; | |
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0); | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(832L); x1+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x1 + (832L*x0)), 16); | |
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(x1), 16); | |
auto tmp14 = out_ptr0[static_cast<long>(x0)]; | |
auto tmp1 = at::vec::convert<float>(tmp0); | |
auto tmp2 = static_cast<float>(0.125); | |
auto tmp3 = at::vec::Vectorized<float>(tmp2); | |
auto tmp4 = tmp1 * tmp3; | |
auto tmp5 = (tmp4); | |
auto tmp7 = static_cast<float>(1.0); | |
auto tmp8 = at::vec::Vectorized<float>(tmp7); | |
auto tmp9 = tmp8 - tmp6; | |
auto tmp10 = static_cast<float>(-10000.0); | |
auto tmp11 = at::vec::Vectorized<float>(tmp10); | |
auto tmp12 = tmp9 * tmp11; | |
auto tmp13 = tmp5 + tmp12; | |
auto tmp15 = at::vec::Vectorized<float>(tmp14); | |
auto tmp16 = tmp13 - tmp15; | |
auto tmp17 = tmp16.exp(); | |
tmp17.store(out_ptr1 + static_cast<long>(x1 + (832L*x0))); | |
tmp_acc0_vec = tmp_acc0_vec + tmp17; | |
} | |
tmp_acc0 = tmp_acc0 + at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return x + y; }, tmp_acc0_vec); | |
out_ptr2[static_cast<long>(x0)] = static_cast<float>(tmp_acc0); | |
} | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(832L); x1+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr1 + static_cast<long>(x1 + (832L*x0)), 16); | |
auto tmp1 = out_ptr2[static_cast<long>(x0)]; | |
auto tmp2 = at::vec::Vectorized<float>(tmp1); | |
auto tmp3 = tmp0 / tmp2; | |
auto tmp4 = at::vec::convert<bfloat16>(tmp3); | |
tmp4.store(out_ptr3 + static_cast<long>(x1 + (832L*x0)), 16); | |
} | |
} | |
} | |
} | |
} | |
''') | |
cpp_fused__to_copy_cat_stack_1 = async_compile.cpp_pybinding(['const int32_t*', 'const int32_t*', 'const int32_t*', 'const int32_t*', 'const int32_t*', 'const int32_t*', 'const int32_t*', 'const int32_t*', 'const int32_t*', 'const int32_t*', 'const int32_t*', 'const int32_t*', 'const int32_t*', 'const bfloat16*', 'int32_t*', 'int32_t*', 'int32_t*', 'int32_t*', 'int32_t*', 'int32_t*', 'int32_t*', 'int32_t*', 'int32_t*', 'int32_t*', 'int32_t*', 'int32_t*', 'int64_t*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*'], ''' | |
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h" | |
extern "C" void kernel(const int32_t* in_ptr0, | |
const int32_t* in_ptr1, | |
const int32_t* in_ptr2, | |
const int32_t* in_ptr3, | |
const int32_t* in_ptr4, | |
const int32_t* in_ptr5, | |
const int32_t* in_ptr6, | |
const int32_t* in_ptr7, | |
const int32_t* in_ptr8, | |
const int32_t* in_ptr9, | |
const int32_t* in_ptr10, | |
const int32_t* in_ptr11, | |
const int32_t* in_ptr12, | |
const bfloat16* in_ptr13, | |
int32_t* out_ptr0, | |
int32_t* out_ptr1, | |
int32_t* out_ptr2, | |
int32_t* out_ptr3, | |
int32_t* out_ptr4, | |
int32_t* out_ptr5, | |
int32_t* out_ptr6, | |
int32_t* out_ptr7, | |
int32_t* out_ptr8, | |
int32_t* out_ptr9, | |
int32_t* out_ptr10, | |
int32_t* out_ptr11, | |
int64_t* out_ptr12, | |
bfloat16* out_ptr13, | |
bfloat16* out_ptr14, | |
bfloat16* out_ptr15, | |
bfloat16* out_ptr16, | |
bfloat16* out_ptr17, | |
bfloat16* out_ptr18, | |
bfloat16* out_ptr19, | |
bfloat16* out_ptr20) | |
{ | |
{ | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr0 + static_cast<long>(x0), 16); | |
tmp0.store(out_ptr0 + static_cast<long>(x0), 16); | |
} | |
#pragma omp simd simdlen(8) | |
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L)) | |
{ | |
auto tmp0 = in_ptr0[static_cast<long>(x0)]; | |
out_ptr0[static_cast<long>(x0)] = tmp0; | |
} | |
} | |
{ | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr1 + static_cast<long>(x0), 16); | |
tmp0.store(out_ptr1 + static_cast<long>(x0), 16); | |
} | |
#pragma omp simd simdlen(8) | |
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L)) | |
{ | |
auto tmp0 = in_ptr1[static_cast<long>(x0)]; | |
out_ptr1[static_cast<long>(x0)] = tmp0; | |
} | |
} | |
{ | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr2 + static_cast<long>(x0), 16); | |
tmp0.store(out_ptr2 + static_cast<long>(x0), 16); | |
} | |
#pragma omp simd simdlen(8) | |
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L)) | |
{ | |
auto tmp0 = in_ptr2[static_cast<long>(x0)]; | |
out_ptr2[static_cast<long>(x0)] = tmp0; | |
} | |
} | |
{ | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr3 + static_cast<long>(x0), 16); | |
tmp0.store(out_ptr3 + static_cast<long>(x0), 16); | |
} | |
#pragma omp simd simdlen(8) | |
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L)) | |
{ | |
auto tmp0 = in_ptr3[static_cast<long>(x0)]; | |
out_ptr3[static_cast<long>(x0)] = tmp0; | |
} | |
} | |
{ | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr4 + static_cast<long>(x0), 16); | |
tmp0.store(out_ptr4 + static_cast<long>(x0), 16); | |
} | |
#pragma omp simd simdlen(8) | |
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L)) | |
{ | |
auto tmp0 = in_ptr4[static_cast<long>(x0)]; | |
out_ptr4[static_cast<long>(x0)] = tmp0; | |
} | |
} | |
{ | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr5 + static_cast<long>(x0), 16); | |
tmp0.store(out_ptr5 + static_cast<long>(x0), 16); | |
} | |
#pragma omp simd simdlen(8) | |
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L)) | |
{ | |
auto tmp0 = in_ptr5[static_cast<long>(x0)]; | |
out_ptr5[static_cast<long>(x0)] = tmp0; | |
} | |
} | |
{ | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr6 + static_cast<long>(x0), 16); | |
tmp0.store(out_ptr6 + static_cast<long>(x0), 16); | |
} | |
#pragma omp simd simdlen(8) | |
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L)) | |
{ | |
auto tmp0 = in_ptr6[static_cast<long>(x0)]; | |
out_ptr6[static_cast<long>(x0)] = tmp0; | |
} | |
} | |
{ | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr7 + static_cast<long>(x0), 16); | |
tmp0.store(out_ptr7 + static_cast<long>(x0), 16); | |
} | |
#pragma omp simd simdlen(8) | |
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L)) | |
{ | |
auto tmp0 = in_ptr7[static_cast<long>(x0)]; | |
out_ptr7[static_cast<long>(x0)] = tmp0; | |
} | |
} | |
{ | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr8 + static_cast<long>(x0), 16); | |
tmp0.store(out_ptr8 + static_cast<long>(x0), 16); | |
} | |
#pragma omp simd simdlen(8) | |
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L)) | |
{ | |
auto tmp0 = in_ptr8[static_cast<long>(x0)]; | |
out_ptr8[static_cast<long>(x0)] = tmp0; | |
} | |
} | |
{ | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr9 + static_cast<long>(x0), 16); | |
tmp0.store(out_ptr9 + static_cast<long>(x0), 16); | |
} | |
#pragma omp simd simdlen(8) | |
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L)) | |
{ | |
auto tmp0 = in_ptr9[static_cast<long>(x0)]; | |
out_ptr9[static_cast<long>(x0)] = tmp0; | |
} | |
} | |
{ | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr10 + static_cast<long>(x0), 16); | |
tmp0.store(out_ptr10 + static_cast<long>(x0), 16); | |
} | |
#pragma omp simd simdlen(8) | |
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L)) | |
{ | |
auto tmp0 = in_ptr10[static_cast<long>(x0)]; | |
out_ptr10[static_cast<long>(x0)] = tmp0; | |
} | |
} | |
{ | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr11 + static_cast<long>(x0), 16); | |
tmp0.store(out_ptr11 + static_cast<long>(x0), 16); | |
} | |
#pragma omp simd simdlen(8) | |
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L)) | |
{ | |
auto tmp0 = in_ptr11[static_cast<long>(x0)]; | |
out_ptr11[static_cast<long>(x0)] = tmp0; | |
} | |
} | |
{ | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(384L); x0+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr12 + static_cast<long>(x0), 16); | |
auto tmp1 = at::vec::convert<int64_t,2,int32_t,1>(tmp0); | |
tmp1.store(out_ptr12 + static_cast<long>(x0), 16); | |
} | |
#pragma omp simd simdlen(8) | |
for(long x0=static_cast<long>(384L); x0<static_cast<long>(396L); x0+=static_cast<long>(1L)) | |
{ | |
auto tmp0 = in_ptr12[static_cast<long>(x0)]; | |
auto tmp1 = c10::convert<int64_t>(tmp0); | |
out_ptr12[static_cast<long>(x0)] = tmp1; | |
} | |
} | |
{ | |
#pragma GCC ivdep | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L)) | |
{ | |
#pragma GCC ivdep | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L)) | |
{ | |
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr13 + static_cast<long>(x2 + (64L*x0) + (768L*x1)), 32); | |
tmp0.store(out_ptr13 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32); | |
tmp0.store(out_ptr14 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32); | |
} | |
} | |
} | |
} | |
{ | |
#pragma GCC ivdep | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L)) | |
{ | |
#pragma GCC ivdep | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L)) | |
{ | |
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr13 + static_cast<long>(49152L + x2 + (64L*x0) + (768L*x1)), 32); | |
tmp0.store(out_ptr15 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32); | |
} | |
} | |
} | |
} | |
{ | |
#pragma GCC ivdep | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L)) | |
{ | |
#pragma GCC ivdep | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L)) | |
{ | |
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr13 + static_cast<long>(98304L + x2 + (64L*x0) + (768L*x1)), 32); | |
tmp0.store(out_ptr16 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32); | |
} | |
} | |
} | |
} | |
{ | |
#pragma GCC ivdep | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L)) | |
{ | |
#pragma GCC ivdep | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L)) | |
{ | |
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr13 + static_cast<long>(589824L + x2 + (64L*x0) + (768L*x1)), 32); | |
tmp0.store(out_ptr17 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32); | |
tmp0.store(out_ptr18 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32); | |
} | |
} | |
} | |
} | |
{ | |
#pragma GCC ivdep | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L)) | |
{ | |
#pragma GCC ivdep | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(192L); x1+=static_cast<long>(1L)) | |
{ | |
#pragma GCC ivdep | |
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(1L)) | |
{ | |
auto tmp0 = out_ptr12[static_cast<long>((33L*(c10::div_floor_integer((x2 + (64L*x1) + (135168L*x0)), 135168L))) + (c10::div_floor_integer((x2 + (64L*x1)), 4096L)))]; | |
auto tmp13 = out_ptr12[static_cast<long>((33L*(c10::div_floor_integer((122880L + x2 + (64L*x1) + (135168L*x0)), 135168L))) + (c10::div_floor_integer((122880L + x2 + (64L*x1)), 4096L)))]; | |
auto tmp1 = 13L*(c10::div_floor_integer(((33L*(c10::div_floor_integer((x2 + (64L*x1) + (135168L*x0)), 135168L))) + (c10::div_floor_integer((x2 + (64L*x1)), 4096L))), 33L)); | |
auto tmp2 = c10::convert<int64_t>(tmp1); | |
auto tmp3 = decltype(tmp0)(tmp0 + tmp2); | |
auto tmp4 = 156L; | |
auto tmp5 = c10::convert<int64_t>(tmp4); | |
auto tmp6 = decltype(tmp3)(tmp3 + tmp5); | |
auto tmp7 = tmp3 < 0; | |
auto tmp8 = tmp7 ? tmp6 : tmp3; | |
auto tmp9 = tmp8; | |
auto tmp10 = c10::convert<int64_t>(tmp9); | |
TORCH_CHECK((0 <= tmp10) & (tmp10 < 156L), "index out of bounds: 0 <= tmp10 < 156L"); | |
auto tmp12 = in_ptr13[static_cast<long>(x2 + (64L*(static_cast<long>(c10::div_floor_integer(tmp8, 13L)) % static_cast<long>(12L))) + (768L*(static_cast<long>(x1) % static_cast<long>(64L))) + (49152L*(static_cast<long>(tmp8) % static_cast<long>(13L))))]; | |
auto tmp14 = 13L*(c10::div_floor_integer(((33L*(c10::div_floor_integer((122880L + x2 + (64L*x1) + (135168L*x0)), 135168L))) + (c10::div_floor_integer((122880L + x2 + (64L*x1)), 4096L))), 33L)); | |
auto tmp15 = c10::convert<int64_t>(tmp14); | |
auto tmp16 = decltype(tmp13)(tmp13 + tmp15); | |
auto tmp17 = decltype(tmp16)(tmp16 + tmp5); | |
auto tmp18 = tmp16 < 0; | |
auto tmp19 = tmp18 ? tmp17 : tmp16; | |
auto tmp20 = tmp19; | |
auto tmp21 = c10::convert<int64_t>(tmp20); | |
TORCH_CHECK((0 <= tmp21) & (tmp21 < 156L), "index out of bounds: 0 <= tmp21 < 156L"); | |
auto tmp23 = in_ptr13[static_cast<long>(x2 + (64L*(static_cast<long>(c10::div_floor_integer(tmp19, 13L)) % static_cast<long>(12L))) + (768L*(static_cast<long>(x1) % static_cast<long>(64L))) + (49152L*(static_cast<long>(tmp19) % static_cast<long>(13L))))]; | |
out_ptr19[static_cast<long>(x2 + (64L*x1) + (28672L*x0))] = tmp12; | |
out_ptr20[static_cast<long>(x2 + (64L*x1) + (28672L*x0))] = tmp23; | |
} | |
} | |
} | |
} | |
} | |
''') | |
cpp_fused__softmax_add_cat_minimum_mul_new_ones_rsub_2 = async_compile.cpp_pybinding(['const float*', 'const float*', 'const int64_t*', 'const bfloat16*', 'const float*', 'const float*', 'const bfloat16*', 'float*', 'float*', 'float*', 'float*', 'float*', 'float*', 'float*', 'float*', 'float*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*'], ''' | |
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h" | |
extern "C" void kernel(const float* in_ptr0, | |
const float* in_ptr1, | |
const int64_t* in_ptr2, | |
const bfloat16* in_ptr3, | |
const float* in_ptr4, | |
const float* in_ptr5, | |
const bfloat16* in_ptr6, | |
float* out_ptr0, | |
float* out_ptr1, | |
float* out_ptr2, | |
float* out_ptr3, | |
float* out_ptr4, | |
float* out_ptr5, | |
float* out_ptr6, | |
float* out_ptr7, | |
float* out_ptr8, | |
bfloat16* out_ptr9, | |
bfloat16* out_ptr10, | |
bfloat16* out_ptr11, | |
bfloat16* out_ptr12, | |
bfloat16* out_ptr13, | |
bfloat16* out_ptr14, | |
bfloat16* out_ptr15, | |
bfloat16* out_ptr16, | |
bfloat16* out_ptr17) | |
{ | |
{ | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(192L); x0+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<long>(x0), 16); | |
tmp0.store(out_ptr0 + static_cast<long>(x0)); | |
} | |
} | |
{ | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(64L); x0+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<long>(768L + x0), 16); | |
tmp0.store(out_ptr1 + static_cast<long>(x0)); | |
} | |
} | |
{ | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(192L); x0+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = static_cast<float>(1.0); | |
auto tmp1 = at::vec::Vectorized<float>(tmp0); | |
tmp1.store(out_ptr2 + static_cast<long>(x0)); | |
} | |
} | |
{ | |
#pragma GCC ivdep | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(768L); x0+=static_cast<long>(1L)) | |
{ | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(256L); x1+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = static_cast<float>(1.0); | |
auto tmp1 = at::vec::Vectorized<float>(tmp0); | |
tmp1.store(out_ptr3 + static_cast<long>(x1 + (448L*x0))); | |
} | |
} | |
} | |
{ | |
#pragma GCC ivdep | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L)) | |
{ | |
#pragma GCC ivdep | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L)) | |
{ | |
for(long x2=static_cast<long>(0L); x2<static_cast<long>(192L); x2+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = in_ptr1[static_cast<long>(64L + x1)]; | |
auto tmp1 = | |
[&] | |
{ | |
__at_align__ std::array<int64_t, 16> tmpbuf; | |
#pragma GCC unroll 16 | |
for (long x2_inner = 0; x2_inner < 16; x2_inner++) | |
{ | |
tmpbuf[x2_inner] = in_ptr2[static_cast<long>(c10::div_floor_integer((x2 + x2_inner + (2112L*x0)), 64L))]; | |
} | |
return at::vec::VectorizedN<int64_t,2>::loadu(tmpbuf.data(), 16); | |
} | |
() | |
; | |
auto tmp16 = in_ptr1[static_cast<long>(704L + x1)]; | |
auto tmp17 = | |
[&] | |
{ | |
__at_align__ std::array<int64_t, 16> tmpbuf; | |
#pragma GCC unroll 16 | |
for (long x2_inner = 0; x2_inner < 16; x2_inner++) | |
{ | |
tmpbuf[x2_inner] = in_ptr2[static_cast<long>(c10::div_floor_integer((1920L + x2 + x2_inner + (2112L*x0)), 64L))]; | |
} | |
return at::vec::VectorizedN<int64_t,2>::loadu(tmpbuf.data(), 16); | |
} | |
() | |
; | |
auto tmp2 = 13L; | |
auto tmp3 = c10::convert<int64_t>(tmp2); | |
auto tmp4 = at::vec::VectorizedN<int64_t,2>(tmp3); | |
auto tmp5 = tmp1 + tmp4; | |
auto tmp6 = static_cast<int64_t>(0); | |
auto tmp7 = at::vec::VectorizedN<int64_t,2>(tmp6); | |
auto tmp8 = at::vec::VecMask<int64_t,2>(tmp1 < tmp7); | |
auto tmp9 = decltype(tmp5)::blendv(tmp1, tmp5, tmp8.template cast<int64_t,2>()); | |
auto tmp10 = | |
[&] | |
{ | |
__at_align__ std::array<int64_t, 16> tmpbuf; | |
tmp9.store(tmpbuf.data()); | |
return tmpbuf; | |
} | |
() | |
; | |
auto tmp11 = | |
[&] | |
{ | |
__at_align__ std::array<int64_t, 16> tmpbuf; | |
#pragma GCC unroll 16 | |
for (long x2_inner = 0; x2_inner < 16; x2_inner++) | |
{ | |
tmpbuf[x2_inner] = static_cast<long>(tmp10[x2_inner]); | |
} | |
return at::vec::VectorizedN<int64_t,2>::loadu(tmpbuf.data(), 16); | |
} | |
() | |
; | |
TORCH_CHECK((at::vec::VecMask<int64_t,2>((at::vec::VectorizedN<int64_t,2>(0) <= tmp11) & (tmp11 < at::vec::VectorizedN<int64_t,2>(13L)))).all_masked(), "index out of bounds: 0 <= tmp11 < 13L"); | |
auto tmp13 = | |
[&] | |
{ | |
__at_align__ std::array<float, 16> tmpbuf; | |
#pragma GCC unroll 16 | |
for (long x2_inner = 0; x2_inner < 16; x2_inner++) | |
{ | |
tmpbuf[x2_inner] = in_ptr1[static_cast<long>((64L*tmp10[x2_inner]) + (static_cast<long>((x2 + x2_inner)) % static_cast<long>(64L)))]; | |
} | |
return at::vec::Vectorized<float>::loadu(tmpbuf.data(), 16); | |
} | |
() | |
; | |
auto tmp14 = at::vec::Vectorized<float>(tmp0); | |
auto tmp15 = tmp14 * tmp13; | |
auto tmp18 = tmp17 + tmp4; | |
auto tmp19 = at::vec::VecMask<int64_t,2>(tmp17 < tmp7); | |
auto tmp20 = decltype(tmp18)::blendv(tmp17, tmp18, tmp19.template cast<int64_t,2>()); | |
auto tmp21 = | |
[&] | |
{ | |
__at_align__ std::array<int64_t, 16> tmpbuf; | |
tmp20.store(tmpbuf.data()); | |
return tmpbuf; | |
} | |
() | |
; | |
auto tmp22 = | |
[&] | |
{ | |
__at_align__ std::array<int64_t, 16> tmpbuf; | |
#pragma GCC unroll 16 | |
for (long x2_inner = 0; x2_inner < 16; x2_inner++) | |
{ | |
tmpbuf[x2_inner] = static_cast<long>(tmp21[x2_inner]); | |
} | |
return at::vec::VectorizedN<int64_t,2>::loadu(tmpbuf.data(), 16); | |
} | |
() | |
; | |
TORCH_CHECK((at::vec::VecMask<int64_t,2>((at::vec::VectorizedN<int64_t,2>(0) <= tmp22) & (tmp22 < at::vec::VectorizedN<int64_t,2>(13L)))).all_masked(), "index out of bounds: 0 <= tmp22 < 13L"); | |
auto tmp24 = | |
[&] | |
{ | |
__at_align__ std::array<float, 16> tmpbuf; | |
#pragma GCC unroll 16 | |
for (long x2_inner = 0; x2_inner < 16; x2_inner++) | |
{ | |
tmpbuf[x2_inner] = in_ptr1[static_cast<long>((64L*tmp21[x2_inner]) + (static_cast<long>((x2 + x2_inner)) % static_cast<long>(64L)))]; | |
} | |
return at::vec::Vectorized<float>::loadu(tmpbuf.data(), 16); | |
} | |
() | |
; | |
auto tmp25 = at::vec::Vectorized<float>(tmp16); | |
auto tmp26 = tmp25 * tmp24; | |
tmp15.store(out_ptr4 + static_cast<long>(x2 + (448L*x1) + (28672L*x0))); | |
tmp26.store(out_ptr5 + static_cast<long>(x2 + (448L*x1) + (28672L*x0))); | |
} | |
} | |
} | |
} | |
#pragma omp parallel num_threads(56) | |
{ | |
int tid = omp_get_thread_num(); | |
{ | |
#pragma omp for | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(768L); x0+=static_cast<long>(1L)) | |
{ | |
{ | |
float tmp_acc0 = -std::numeric_limits<float>::infinity(); | |
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(-std::numeric_limits<float>::infinity()); | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(448L); x1+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr3 + static_cast<long>(x1 + (448L*x0)), 16); | |
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr4 + static_cast<long>(x1), 16); | |
auto tmp7 = at::vec::Vectorized<float>::loadu(in_ptr5 + static_cast<long>(x1 + (448L*x0)), 16); | |
auto tmp1 = at::vec::convert<float>(tmp0); | |
auto tmp2 = static_cast<float>(0.125); | |
auto tmp3 = at::vec::Vectorized<float>(tmp2); | |
auto tmp4 = tmp1 * tmp3; | |
auto tmp5 = (tmp4); | |
auto tmp8 = at::vec::minimum(tmp6, tmp7); | |
auto tmp9 = static_cast<float>(1.0); | |
auto tmp10 = at::vec::Vectorized<float>(tmp9); | |
auto tmp11 = tmp10 - tmp8; | |
auto tmp12 = static_cast<float>(-10000.0); | |
auto tmp13 = at::vec::Vectorized<float>(tmp12); | |
auto tmp14 = tmp11 * tmp13; | |
auto tmp15 = tmp5 + tmp14; | |
tmp_acc0_vec = at::vec::maximum(tmp_acc0_vec, tmp15); | |
} | |
tmp_acc0 = max_propagate_nan(tmp_acc0, at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return at::vec::maximum(x, y); }, tmp_acc0_vec)); | |
out_ptr6[static_cast<long>(x0)] = static_cast<float>(tmp_acc0); | |
} | |
{ | |
float tmp_acc0 = 0; | |
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0); | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(448L); x1+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr3 + static_cast<long>(x1 + (448L*x0)), 16); | |
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr4 + static_cast<long>(x1), 16); | |
auto tmp7 = at::vec::Vectorized<float>::loadu(in_ptr5 + static_cast<long>(x1 + (448L*x0)), 16); | |
auto tmp16 = out_ptr6[static_cast<long>(x0)]; | |
auto tmp1 = at::vec::convert<float>(tmp0); | |
auto tmp2 = static_cast<float>(0.125); | |
auto tmp3 = at::vec::Vectorized<float>(tmp2); | |
auto tmp4 = tmp1 * tmp3; | |
auto tmp5 = (tmp4); | |
auto tmp8 = at::vec::minimum(tmp6, tmp7); | |
auto tmp9 = static_cast<float>(1.0); | |
auto tmp10 = at::vec::Vectorized<float>(tmp9); | |
auto tmp11 = tmp10 - tmp8; | |
auto tmp12 = static_cast<float>(-10000.0); | |
auto tmp13 = at::vec::Vectorized<float>(tmp12); | |
auto tmp14 = tmp11 * tmp13; | |
auto tmp15 = tmp5 + tmp14; | |
auto tmp17 = at::vec::Vectorized<float>(tmp16); | |
auto tmp18 = tmp15 - tmp17; | |
auto tmp19 = tmp18.exp(); | |
tmp19.store(out_ptr7 + static_cast<long>(x1 + (448L*x0))); | |
tmp_acc0_vec = tmp_acc0_vec + tmp19; | |
} | |
tmp_acc0 = tmp_acc0 + at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return x + y; }, tmp_acc0_vec); | |
out_ptr8[static_cast<long>(x0)] = static_cast<float>(tmp_acc0); | |
} | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(448L); x1+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr7 + static_cast<long>(x1 + (448L*x0)), 16); | |
auto tmp1 = out_ptr8[static_cast<long>(x0)]; | |
auto tmp2 = at::vec::Vectorized<float>(tmp1); | |
auto tmp3 = tmp0 / tmp2; | |
auto tmp4 = at::vec::convert<bfloat16>(tmp3); | |
tmp4.store(out_ptr9 + static_cast<long>(x1 + (448L*x0)), 16); | |
} | |
} | |
} | |
#pragma omp single | |
{ | |
{ | |
#pragma GCC ivdep | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L)) | |
{ | |
#pragma GCC ivdep | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L)) | |
{ | |
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr6 + static_cast<long>(x2 + (64L*x0) + (768L*x1)), 32); | |
tmp0.store(out_ptr10 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32); | |
tmp0.store(out_ptr11 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32); | |
} | |
} | |
} | |
} | |
} | |
#pragma omp single | |
{ | |
{ | |
#pragma GCC ivdep | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L)) | |
{ | |
#pragma GCC ivdep | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L)) | |
{ | |
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr6 + static_cast<long>(49152L + x2 + (64L*x0) + (768L*x1)), 32); | |
tmp0.store(out_ptr12 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32); | |
} | |
} | |
} | |
} | |
} | |
#pragma omp single | |
{ | |
{ | |
#pragma GCC ivdep | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L)) | |
{ | |
#pragma GCC ivdep | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L)) | |
{ | |
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr6 + static_cast<long>(98304L + x2 + (64L*x0) + (768L*x1)), 32); | |
tmp0.store(out_ptr13 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32); | |
} | |
} | |
} | |
} | |
} | |
#pragma omp single | |
{ | |
{ | |
#pragma GCC ivdep | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L)) | |
{ | |
#pragma GCC ivdep | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L)) | |
{ | |
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr6 + static_cast<long>(589824L + x2 + (64L*x0) + (768L*x1)), 32); | |
tmp0.store(out_ptr14 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32); | |
tmp0.store(out_ptr15 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32); | |
} | |
} | |
} | |
} | |
} | |
#pragma omp single | |
{ | |
{ | |
#pragma GCC ivdep | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L)) | |
{ | |
#pragma GCC ivdep | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(192L); x1+=static_cast<long>(1L)) | |
{ | |
#pragma GCC ivdep | |
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(1L)) | |
{ | |
auto tmp0 = in_ptr2[static_cast<long>((33L*(c10::div_floor_integer((x2 + (64L*x1) + (135168L*x0)), 135168L))) + (c10::div_floor_integer((x2 + (64L*x1)), 4096L)))]; | |
auto tmp13 = in_ptr2[static_cast<long>((33L*(c10::div_floor_integer((122880L + x2 + (64L*x1) + (135168L*x0)), 135168L))) + (c10::div_floor_integer((122880L + x2 + (64L*x1)), 4096L)))]; | |
auto tmp1 = 13L*(c10::div_floor_integer(((33L*(c10::div_floor_integer((x2 + (64L*x1) + (135168L*x0)), 135168L))) + (c10::div_floor_integer((x2 + (64L*x1)), 4096L))), 33L)); | |
auto tmp2 = c10::convert<int64_t>(tmp1); | |
auto tmp3 = decltype(tmp0)(tmp0 + tmp2); | |
auto tmp4 = 156L; | |
auto tmp5 = c10::convert<int64_t>(tmp4); | |
auto tmp6 = decltype(tmp3)(tmp3 + tmp5); | |
auto tmp7 = tmp3 < 0; | |
auto tmp8 = tmp7 ? tmp6 : tmp3; | |
auto tmp9 = tmp8; | |
auto tmp10 = c10::convert<int64_t>(tmp9); | |
TORCH_CHECK((0 <= tmp10) & (tmp10 < 156L), "index out of bounds: 0 <= tmp10 < 156L"); | |
auto tmp12 = in_ptr6[static_cast<long>(x2 + (64L*(static_cast<long>(c10::div_floor_integer(tmp8, 13L)) % static_cast<long>(12L))) + (768L*(static_cast<long>(x1) % static_cast<long>(64L))) + (49152L*(static_cast<long>(tmp8) % static_cast<long>(13L))))]; | |
auto tmp14 = 13L*(c10::div_floor_integer(((33L*(c10::div_floor_integer((122880L + x2 + (64L*x1) + (135168L*x0)), 135168L))) + (c10::div_floor_integer((122880L + x2 + (64L*x1)), 4096L))), 33L)); | |
auto tmp15 = c10::convert<int64_t>(tmp14); | |
auto tmp16 = decltype(tmp13)(tmp13 + tmp15); | |
auto tmp17 = decltype(tmp16)(tmp16 + tmp5); | |
auto tmp18 = tmp16 < 0; | |
auto tmp19 = tmp18 ? tmp17 : tmp16; | |
auto tmp20 = tmp19; | |
auto tmp21 = c10::convert<int64_t>(tmp20); | |
TORCH_CHECK((0 <= tmp21) & (tmp21 < 156L), "index out of bounds: 0 <= tmp21 < 156L"); | |
auto tmp23 = in_ptr6[static_cast<long>(x2 + (64L*(static_cast<long>(c10::div_floor_integer(tmp19, 13L)) % static_cast<long>(12L))) + (768L*(static_cast<long>(x1) % static_cast<long>(64L))) + (49152L*(static_cast<long>(tmp19) % static_cast<long>(13L))))]; | |
out_ptr16[static_cast<long>(x2 + (64L*x1) + (28672L*x0))] = tmp12; | |
out_ptr17[static_cast<long>(x2 + (64L*x1) + (28672L*x0))] = tmp23; | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
''') | |
cpp_fused_cat_clone_3 = async_compile.cpp_pybinding(['const bfloat16*', 'const bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*'], ''' | |
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h" | |
extern "C" void kernel(const bfloat16* in_ptr0, | |
const bfloat16* in_ptr1, | |
bfloat16* out_ptr0, | |
bfloat16* out_ptr1, | |
bfloat16* out_ptr2, | |
bfloat16* out_ptr3) | |
{ | |
#pragma omp parallel num_threads(56) | |
{ | |
int tid = omp_get_thread_num(); | |
{ | |
#pragma omp for | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L)) | |
{ | |
#pragma GCC ivdep | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(9L); x1+=static_cast<long>(1L)) | |
{ | |
#pragma GCC ivdep | |
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(1L)) | |
{ | |
for(long x3=static_cast<long>(0L); x3<static_cast<long>(64L); x3+=static_cast<long>(32L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(49152L + x3 + (64L*x0) + (768L*x2) + (49152L*x1)), 32); | |
tmp0.store(out_ptr0 + static_cast<long>(x3 + (64L*x2) + (12288L*x1) + (110592L*x0)), 32); | |
} | |
} | |
} | |
} | |
} | |
{ | |
#pragma omp for | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L)) | |
{ | |
#pragma GCC ivdep | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(9L); x1+=static_cast<long>(1L)) | |
{ | |
#pragma GCC ivdep | |
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(1L)) | |
{ | |
for(long x3=static_cast<long>(0L); x3<static_cast<long>(64L); x3+=static_cast<long>(32L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(98304L + x3 + (64L*x0) + (768L*x2) + (49152L*x1)), 32); | |
tmp0.store(out_ptr1 + static_cast<long>(x3 + (64L*x2) + (12288L*x1) + (110592L*x0)), 32); | |
} | |
} | |
} | |
} | |
} | |
{ | |
#pragma omp for | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L)) | |
{ | |
#pragma GCC ivdep | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(9L); x1+=static_cast<long>(1L)) | |
{ | |
#pragma GCC ivdep | |
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(1L)) | |
{ | |
for(long x3=static_cast<long>(0L); x3<static_cast<long>(64L); x3+=static_cast<long>(32L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(147456L + x3 + (64L*x0) + (768L*x2) + (49152L*x1)), 32); | |
tmp0.store(out_ptr2 + static_cast<long>(x3 + (64L*x2) + (12288L*x1) + (110592L*x0)), 32); | |
} | |
} | |
} | |
} | |
} | |
{ | |
#pragma omp for | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L)) | |
{ | |
#pragma GCC ivdep | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(576L); x1+=static_cast<long>(1L)) | |
{ | |
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr1 + static_cast<long>(98304L + x2 + (64L*x0) + (768L*x1)), 32); | |
tmp0.store(out_ptr3 + static_cast<long>(x2 + (64L*x1) + (36864L*x0)), 32); | |
} | |
} | |
} | |
} | |
} | |
} | |
''') | |
cpp_fused_clone_4 = async_compile.cpp_pybinding(['const int64_t*', 'const bfloat16*', 'const bfloat16*', 'bfloat16*', 'bfloat16*'], ''' | |
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h" | |
extern "C" void kernel(const int64_t* in_ptr0, | |
const bfloat16* in_ptr1, | |
const bfloat16* in_ptr2, | |
bfloat16* out_ptr0, | |
bfloat16* out_ptr1) | |
{ | |
#pragma omp parallel num_threads(56) | |
{ | |
int tid = omp_get_thread_num(); | |
{ | |
#pragma omp for | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L)) | |
{ | |
#pragma GCC ivdep | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(9L); x1+=static_cast<long>(1L)) | |
{ | |
#pragma GCC ivdep | |
for(long x2=static_cast<long>(0L); x2<static_cast<long>(192L); x2+=static_cast<long>(1L)) | |
{ | |
#pragma GCC ivdep | |
for(long x3=static_cast<long>(0L); x3<static_cast<long>(64L); x3+=static_cast<long>(1L)) | |
{ | |
auto tmp0 = in_ptr0[static_cast<long>((33L*(c10::div_floor_integer((12288L + x3 + (64L*x2) + (12288L*x1) + (135168L*x0)), 135168L))) + (c10::div_floor_integer((12288L + x3 + (64L*x2) + (12288L*x1)), 4096L)))]; | |
auto tmp1 = 13L*(c10::div_floor_integer(((33L*(c10::div_floor_integer((12288L + x3 + (64L*x2) + (12288L*x1) + (135168L*x0)), 135168L))) + (c10::div_floor_integer((12288L + x3 + (64L*x2) + (12288L*x1)), 4096L))), 33L)); | |
auto tmp2 = c10::convert<int64_t>(tmp1); | |
auto tmp3 = decltype(tmp0)(tmp0 + tmp2); | |
auto tmp4 = 156L; | |
auto tmp5 = c10::convert<int64_t>(tmp4); | |
auto tmp6 = decltype(tmp3)(tmp3 + tmp5); | |
auto tmp7 = tmp3 < 0; | |
auto tmp8 = tmp7 ? tmp6 : tmp3; | |
auto tmp9 = tmp8; | |
auto tmp10 = c10::convert<int64_t>(tmp9); | |
TORCH_CHECK((0 <= tmp10) & (tmp10 < 156L), "index out of bounds: 0 <= tmp10 < 156L"); | |
auto tmp12 = in_ptr1[static_cast<long>(x3 + (64L*(static_cast<long>(c10::div_floor_integer(tmp8, 13L)) % static_cast<long>(12L))) + (768L*(static_cast<long>(x2) % static_cast<long>(64L))) + (49152L*(static_cast<long>(tmp8) % static_cast<long>(13L))))]; | |
auto tmp13 = in_ptr2[static_cast<long>(x3 + (64L*(static_cast<long>(c10::div_floor_integer(tmp8, 13L)) % static_cast<long>(12L))) + (768L*(static_cast<long>(x2) % static_cast<long>(64L))) + (49152L*(static_cast<long>(tmp8) % static_cast<long>(13L))))]; | |
out_ptr0[static_cast<long>(x3 + (64L*x2) + (12288L*x1) + (110592L*x0))] = tmp12; | |
out_ptr1[static_cast<long>(x3 + (64L*x2) + (12288L*x1) + (110592L*x0))] = tmp13; | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
''') | |
cpp_fused__softmax__to_copy_add_cat_mul_rsub_5 = async_compile.cpp_pybinding(['const bfloat16*', 'const float*', 'const bfloat16*', 'const float*', 'const bfloat16*', 'const float*', 'const int64_t*', 'const bfloat16*', 'const bfloat16*', 'const bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'float*', 'float*', 'float*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*'], ''' | |
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h" | |
extern "C" void kernel(const bfloat16* in_ptr0, | |
const float* in_ptr1, | |
const bfloat16* in_ptr2, | |
const float* in_ptr3, | |
const bfloat16* in_ptr4, | |
const float* in_ptr5, | |
const int64_t* in_ptr6, | |
const bfloat16* in_ptr7, | |
const bfloat16* in_ptr8, | |
const bfloat16* in_ptr9, | |
bfloat16* out_ptr0, | |
bfloat16* out_ptr1, | |
bfloat16* out_ptr2, | |
bfloat16* out_ptr3, | |
float* out_ptr4, | |
float* out_ptr5, | |
float* out_ptr6, | |
bfloat16* out_ptr7, | |
bfloat16* out_ptr8, | |
bfloat16* out_ptr9, | |
bfloat16* out_ptr10) | |
{ | |
#pragma omp parallel num_threads(56) | |
{ | |
int tid = omp_get_thread_num(); | |
{ | |
#pragma omp for | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(6912L); x0+=static_cast<long>(1L)) | |
{ | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x1 + (64L*x0)), 16); | |
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(x1), 16); | |
auto tmp1 = at::vec::convert<float>(tmp0); | |
auto tmp2 = static_cast<float>(0.125); | |
auto tmp3 = at::vec::Vectorized<float>(tmp2); | |
auto tmp4 = tmp1 * tmp3; | |
auto tmp5 = (tmp4); | |
auto tmp7 = static_cast<float>(1.0); | |
auto tmp8 = at::vec::Vectorized<float>(tmp7); | |
auto tmp9 = tmp8 - tmp6; | |
auto tmp10 = static_cast<float>(-10000.0); | |
auto tmp11 = at::vec::Vectorized<float>(tmp10); | |
auto tmp12 = tmp9 * tmp11; | |
auto tmp13 = tmp5 + tmp12; | |
auto tmp14 = at::vec::convert<bfloat16>(tmp13); | |
tmp14.store(out_ptr0 + static_cast<long>(x1 + (512L*x0)), 16); | |
} | |
} | |
} | |
{ | |
#pragma omp for | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L)) | |
{ | |
#pragma GCC ivdep | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(576L); x1+=static_cast<long>(1L)) | |
{ | |
for(long x2=static_cast<long>(0L); x2<static_cast<long>(192L); x2+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr2 + static_cast<long>(x2 + (192L*x1) + (110592L*x0)), 16); | |
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr3 + static_cast<long>(x2 + (192L*x1)), 16); | |
auto tmp1 = at::vec::convert<float>(tmp0); | |
auto tmp2 = static_cast<float>(0.125); | |
auto tmp3 = at::vec::Vectorized<float>(tmp2); | |
auto tmp4 = tmp1 * tmp3; | |
auto tmp5 = (tmp4); | |
auto tmp7 = static_cast<float>(1.0); | |
auto tmp8 = at::vec::Vectorized<float>(tmp7); | |
auto tmp9 = tmp8 - tmp6; | |
auto tmp10 = static_cast<float>(-10000.0); | |
auto tmp11 = at::vec::Vectorized<float>(tmp10); | |
auto tmp12 = tmp9 * tmp11; | |
auto tmp13 = tmp5 + tmp12; | |
auto tmp14 = at::vec::convert<bfloat16>(tmp13); | |
tmp14.store(out_ptr1 + static_cast<long>(x2 + (512L*x1) + (294912L*x0)), 16); | |
} | |
} | |
} | |
} | |
{ | |
#pragma omp for | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L)) | |
{ | |
#pragma GCC ivdep | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(9L); x1+=static_cast<long>(1L)) | |
{ | |
#pragma GCC ivdep | |
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(1L)) | |
{ | |
for(long x3=static_cast<long>(0L); x3<static_cast<long>(192L); x3+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr4 + static_cast<long>(x3 + (192L*x2) + (12288L*x1) + (110592L*x0)), 16); | |
auto tmp6 = in_ptr5[static_cast<long>(128L + x2 + (64L*x1))]; | |
auto tmp7 = | |
[&] | |
{ | |
__at_align__ std::array<int64_t, 16> tmpbuf; | |
#pragma GCC unroll 16 | |
for (long x3_inner = 0; x3_inner < 16; x3_inner++) | |
{ | |
tmpbuf[x3_inner] = in_ptr6[static_cast<long>(c10::div_floor_integer((192L + x3 + x3_inner + (192L*x1) + (2112L*x0)), 64L))]; | |
} | |
return at::vec::VectorizedN<int64_t,2>::loadu(tmpbuf.data(), 16); | |
} | |
() | |
; | |
auto tmp1 = at::vec::convert<float>(tmp0); | |
auto tmp2 = static_cast<float>(0.125); | |
auto tmp3 = at::vec::Vectorized<float>(tmp2); | |
auto tmp4 = tmp1 * tmp3; | |
auto tmp5 = (tmp4); | |
auto tmp8 = 13L; | |
auto tmp9 = c10::convert<int64_t>(tmp8); | |
auto tmp10 = at::vec::VectorizedN<int64_t,2>(tmp9); | |
auto tmp11 = tmp7 + tmp10; | |
auto tmp12 = static_cast<int64_t>(0); | |
auto tmp13 = at::vec::VectorizedN<int64_t,2>(tmp12); | |
auto tmp14 = at::vec::VecMask<int64_t,2>(tmp7 < tmp13); | |
auto tmp15 = decltype(tmp11)::blendv(tmp7, tmp11, tmp14.template cast<int64_t,2>()); | |
auto tmp16 = | |
[&] | |
{ | |
__at_align__ std::array<int64_t, 16> tmpbuf; | |
tmp15.store(tmpbuf.data()); | |
return tmpbuf; | |
} | |
() | |
; | |
auto tmp17 = | |
[&] | |
{ | |
__at_align__ std::array<int64_t, 16> tmpbuf; | |
#pragma GCC unroll 16 | |
for (long x3_inner = 0; x3_inner < 16; x3_inner++) | |
{ | |
tmpbuf[x3_inner] = static_cast<long>(tmp16[x3_inner]); | |
} | |
return at::vec::VectorizedN<int64_t,2>::loadu(tmpbuf.data(), 16); | |
} | |
() | |
; | |
TORCH_CHECK((at::vec::VecMask<int64_t,2>((at::vec::VectorizedN<int64_t,2>(0) <= tmp17) & (tmp17 < at::vec::VectorizedN<int64_t,2>(13L)))).all_masked(), "index out of bounds: 0 <= tmp17 < 13L"); | |
auto tmp19 = | |
[&] | |
{ | |
__at_align__ std::array<float, 16> tmpbuf; | |
#pragma GCC unroll 16 | |
for (long x3_inner = 0; x3_inner < 16; x3_inner++) | |
{ | |
tmpbuf[x3_inner] = in_ptr5[static_cast<long>((64L*tmp16[x3_inner]) + (static_cast<long>((x3 + x3_inner)) % static_cast<long>(64L)))]; | |
} | |
return at::vec::Vectorized<float>::loadu(tmpbuf.data(), 16); | |
} | |
() | |
; | |
auto tmp20 = at::vec::Vectorized<float>(tmp6); | |
auto tmp21 = tmp20 * tmp19; | |
auto tmp22 = static_cast<float>(1.0); | |
auto tmp23 = at::vec::Vectorized<float>(tmp22); | |
auto tmp24 = tmp23 - tmp21; | |
auto tmp25 = static_cast<float>(-10000.0); | |
auto tmp26 = at::vec::Vectorized<float>(tmp25); | |
auto tmp27 = tmp24 * tmp26; | |
auto tmp28 = tmp5 + tmp27; | |
auto tmp29 = at::vec::convert<bfloat16>(tmp28); | |
tmp29.store(out_ptr2 + static_cast<long>(x3 + (512L*x2) + (32768L*x1) + (294912L*x0)), 16); | |
} | |
} | |
} | |
} | |
} | |
{ | |
#pragma omp for | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(6912L); x0+=static_cast<long>(1L)) | |
{ | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr7 + static_cast<long>(x1 + (64L*x0)), 16); | |
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(768L + x1), 16); | |
auto tmp1 = at::vec::convert<float>(tmp0); | |
auto tmp2 = static_cast<float>(0.125); | |
auto tmp3 = at::vec::Vectorized<float>(tmp2); | |
auto tmp4 = tmp1 * tmp3; | |
auto tmp5 = (tmp4); | |
auto tmp7 = static_cast<float>(1.0); | |
auto tmp8 = at::vec::Vectorized<float>(tmp7); | |
auto tmp9 = tmp8 - tmp6; | |
auto tmp10 = static_cast<float>(-10000.0); | |
auto tmp11 = at::vec::Vectorized<float>(tmp10); | |
auto tmp12 = tmp9 * tmp11; | |
auto tmp13 = tmp5 + tmp12; | |
auto tmp14 = at::vec::convert<bfloat16>(tmp13); | |
tmp14.store(out_ptr3 + static_cast<long>(x1 + (512L*x0)), 16); | |
} | |
} | |
} | |
{ | |
#pragma omp for | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(6912L); x0+=static_cast<long>(1L)) | |
{ | |
{ | |
float tmp_acc0 = -std::numeric_limits<float>::infinity(); | |
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(-std::numeric_limits<float>::infinity()); | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(512L); x1+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr8 + static_cast<long>(x1 + (512L*x0)), 16); | |
auto tmp1 = at::vec::convert<float>(tmp0); | |
tmp_acc0_vec = at::vec::maximum(tmp_acc0_vec, tmp1); | |
} | |
tmp_acc0 = max_propagate_nan(tmp_acc0, at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return at::vec::maximum(x, y); }, tmp_acc0_vec)); | |
out_ptr4[static_cast<long>(x0)] = static_cast<float>(tmp_acc0); | |
} | |
{ | |
float tmp_acc0 = 0; | |
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0); | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(512L); x1+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr8 + static_cast<long>(x1 + (512L*x0)), 16); | |
auto tmp2 = out_ptr4[static_cast<long>(x0)]; | |
auto tmp1 = at::vec::convert<float>(tmp0); | |
auto tmp3 = at::vec::Vectorized<float>(tmp2); | |
auto tmp4 = tmp1 - tmp3; | |
auto tmp5 = tmp4.exp(); | |
tmp5.store(out_ptr5 + static_cast<long>(x1 + (512L*x0))); | |
tmp_acc0_vec = tmp_acc0_vec + tmp5; | |
} | |
tmp_acc0 = tmp_acc0 + at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return x + y; }, tmp_acc0_vec); | |
out_ptr6[static_cast<long>(x0)] = static_cast<float>(tmp_acc0); | |
} | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(512L); x1+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr5 + static_cast<long>(x1 + (512L*x0)), 16); | |
auto tmp1 = out_ptr6[static_cast<long>(x0)]; | |
auto tmp2 = at::vec::Vectorized<float>(tmp1); | |
auto tmp3 = tmp0 / tmp2; | |
auto tmp4 = at::vec::convert<bfloat16>(tmp3); | |
tmp4.store(out_ptr7 + static_cast<long>(x1 + (512L*x0)), 16); | |
} | |
} | |
} | |
{ | |
#pragma omp for | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L)) | |
{ | |
#pragma GCC ivdep | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(9L); x1+=static_cast<long>(1L)) | |
{ | |
#pragma GCC ivdep | |
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(1L)) | |
{ | |
for(long x3=static_cast<long>(0L); x3<static_cast<long>(64L); x3+=static_cast<long>(32L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr9 + static_cast<long>(49152L + x3 + (64L*x0) + (768L*x2) + (49152L*x1)), 32); | |
tmp0.store(out_ptr8 + static_cast<long>(x3 + (64L*x2) + (12288L*x1) + (110592L*x0)), 32); | |
} | |
} | |
} | |
} | |
} | |
{ | |
#pragma omp for | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L)) | |
{ | |
#pragma GCC ivdep | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(9L); x1+=static_cast<long>(1L)) | |
{ | |
#pragma GCC ivdep | |
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(1L)) | |
{ | |
for(long x3=static_cast<long>(0L); x3<static_cast<long>(64L); x3+=static_cast<long>(32L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr9 + static_cast<long>(98304L + x3 + (64L*x0) + (768L*x2) + (49152L*x1)), 32); | |
tmp0.store(out_ptr9 + static_cast<long>(x3 + (64L*x2) + (12288L*x1) + (110592L*x0)), 32); | |
} | |
} | |
} | |
} | |
} | |
{ | |
#pragma omp for | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L)) | |
{ | |
#pragma GCC ivdep | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(9L); x1+=static_cast<long>(1L)) | |
{ | |
#pragma GCC ivdep | |
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(1L)) | |
{ | |
for(long x3=static_cast<long>(0L); x3<static_cast<long>(64L); x3+=static_cast<long>(32L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr9 + static_cast<long>(147456L + x3 + (64L*x0) + (768L*x2) + (49152L*x1)), 32); | |
tmp0.store(out_ptr10 + static_cast<long>(x3 + (64L*x2) + (12288L*x1) + (110592L*x0)), 32); | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
''') | |
cpp_fused_cat_6 = async_compile.cpp_pybinding(['const bfloat16*', 'bfloat16*', 'bfloat16*'], ''' | |
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h" | |
extern "C" void kernel(const bfloat16* in_ptr0, | |
bfloat16* out_ptr0, | |
bfloat16* out_ptr1) | |
{ | |
{ | |
#pragma GCC ivdep | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L)) | |
{ | |
#pragma GCC ivdep | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L)) | |
{ | |
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(491520L + x2 + (64L*x0) + (768L*x1)), 32); | |
tmp0.store(out_ptr0 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32); | |
} | |
} | |
} | |
} | |
{ | |
#pragma GCC ivdep | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L)) | |
{ | |
#pragma GCC ivdep | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L)) | |
{ | |
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(540672L + x2 + (64L*x0) + (768L*x1)), 32); | |
tmp0.store(out_ptr1 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32); | |
} | |
} | |
} | |
} | |
} | |
''') | |
cpp_fused__softmax_add_cat_minimum_mul_rsub_7 = async_compile.cpp_pybinding(['const float*', 'const bfloat16*', 'const float*', 'const float*', 'const bfloat16*', 'float*', 'float*', 'float*', 'float*', 'float*', 'float*', 'float*', 'bfloat16*', 'bfloat16*', 'bfloat16*'], ''' | |
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h" | |
extern "C" void kernel(const float* in_ptr0, | |
const bfloat16* in_ptr1, | |
const float* in_ptr2, | |
const float* in_ptr3, | |
const bfloat16* in_ptr4, | |
float* out_ptr0, | |
float* out_ptr1, | |
float* out_ptr2, | |
float* out_ptr3, | |
float* out_ptr4, | |
float* out_ptr5, | |
float* out_ptr6, | |
bfloat16* out_ptr7, | |
bfloat16* out_ptr8, | |
bfloat16* out_ptr9) | |
{ | |
{ | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(64L); x0+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<long>(x0), 16); | |
tmp0.store(out_ptr0 + static_cast<long>(x0)); | |
} | |
} | |
{ | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(192L); x0+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<long>(640L + x0), 16); | |
tmp0.store(out_ptr1 + static_cast<long>(x0)); | |
} | |
} | |
{ | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(192L); x0+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = static_cast<float>(1.0); | |
auto tmp1 = at::vec::Vectorized<float>(tmp0); | |
tmp1.store(out_ptr2 + static_cast<long>(x0)); | |
} | |
} | |
{ | |
#pragma GCC ivdep | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(768L); x0+=static_cast<long>(1L)) | |
{ | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(256L); x1+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = static_cast<float>(1.0); | |
auto tmp1 = at::vec::Vectorized<float>(tmp0); | |
tmp1.store(out_ptr3 + static_cast<long>(x1 + (448L*x0))); | |
} | |
} | |
} | |
#pragma omp parallel num_threads(56) | |
{ | |
int tid = omp_get_thread_num(); | |
{ | |
#pragma omp for | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(768L); x0+=static_cast<long>(1L)) | |
{ | |
{ | |
float tmp_acc0 = -std::numeric_limits<float>::infinity(); | |
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(-std::numeric_limits<float>::infinity()); | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(448L); x1+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr1 + static_cast<long>(x1 + (448L*x0)), 16); | |
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr2 + static_cast<long>(x1), 16); | |
auto tmp7 = at::vec::Vectorized<float>::loadu(in_ptr3 + static_cast<long>(x1 + (448L*x0)), 16); | |
auto tmp1 = at::vec::convert<float>(tmp0); | |
auto tmp2 = static_cast<float>(0.125); | |
auto tmp3 = at::vec::Vectorized<float>(tmp2); | |
auto tmp4 = tmp1 * tmp3; | |
auto tmp5 = (tmp4); | |
auto tmp8 = at::vec::minimum(tmp6, tmp7); | |
auto tmp9 = static_cast<float>(1.0); | |
auto tmp10 = at::vec::Vectorized<float>(tmp9); | |
auto tmp11 = tmp10 - tmp8; | |
auto tmp12 = static_cast<float>(-10000.0); | |
auto tmp13 = at::vec::Vectorized<float>(tmp12); | |
auto tmp14 = tmp11 * tmp13; | |
auto tmp15 = tmp5 + tmp14; | |
tmp_acc0_vec = at::vec::maximum(tmp_acc0_vec, tmp15); | |
} | |
tmp_acc0 = max_propagate_nan(tmp_acc0, at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return at::vec::maximum(x, y); }, tmp_acc0_vec)); | |
out_ptr4[static_cast<long>(x0)] = static_cast<float>(tmp_acc0); | |
} | |
{ | |
float tmp_acc0 = 0; | |
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0); | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(448L); x1+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr1 + static_cast<long>(x1 + (448L*x0)), 16); | |
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr2 + static_cast<long>(x1), 16); | |
auto tmp7 = at::vec::Vectorized<float>::loadu(in_ptr3 + static_cast<long>(x1 + (448L*x0)), 16); | |
auto tmp16 = out_ptr4[static_cast<long>(x0)]; | |
auto tmp1 = at::vec::convert<float>(tmp0); | |
auto tmp2 = static_cast<float>(0.125); | |
auto tmp3 = at::vec::Vectorized<float>(tmp2); | |
auto tmp4 = tmp1 * tmp3; | |
auto tmp5 = (tmp4); | |
auto tmp8 = at::vec::minimum(tmp6, tmp7); | |
auto tmp9 = static_cast<float>(1.0); | |
auto tmp10 = at::vec::Vectorized<float>(tmp9); | |
auto tmp11 = tmp10 - tmp8; | |
auto tmp12 = static_cast<float>(-10000.0); | |
auto tmp13 = at::vec::Vectorized<float>(tmp12); | |
auto tmp14 = tmp11 * tmp13; | |
auto tmp15 = tmp5 + tmp14; | |
auto tmp17 = at::vec::Vectorized<float>(tmp16); | |
auto tmp18 = tmp15 - tmp17; | |
auto tmp19 = tmp18.exp(); | |
tmp19.store(out_ptr5 + static_cast<long>(x1 + (448L*x0))); | |
tmp_acc0_vec = tmp_acc0_vec + tmp19; | |
} | |
tmp_acc0 = tmp_acc0 + at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return x + y; }, tmp_acc0_vec); | |
out_ptr6[static_cast<long>(x0)] = static_cast<float>(tmp_acc0); | |
} | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(448L); x1+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr5 + static_cast<long>(x1 + (448L*x0)), 16); | |
auto tmp1 = out_ptr6[static_cast<long>(x0)]; | |
auto tmp2 = at::vec::Vectorized<float>(tmp1); | |
auto tmp3 = tmp0 / tmp2; | |
auto tmp4 = at::vec::convert<bfloat16>(tmp3); | |
tmp4.store(out_ptr7 + static_cast<long>(x1 + (448L*x0)), 16); | |
} | |
} | |
} | |
#pragma omp single | |
{ | |
{ | |
#pragma GCC ivdep | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L)) | |
{ | |
#pragma GCC ivdep | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L)) | |
{ | |
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr4 + static_cast<long>(491520L + x2 + (64L*x0) + (768L*x1)), 32); | |
tmp0.store(out_ptr8 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32); | |
} | |
} | |
} | |
} | |
} | |
#pragma omp single | |
{ | |
{ | |
#pragma GCC ivdep | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L)) | |
{ | |
#pragma GCC ivdep | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L)) | |
{ | |
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr4 + static_cast<long>(540672L + x2 + (64L*x0) + (768L*x1)), 32); | |
tmp0.store(out_ptr9 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32); | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
''') | |
cpp_fused__softmax_add_mul_rsub_8 = async_compile.cpp_pybinding(['const bfloat16*', 'const float*', 'float*', 'float*', 'float*', 'bfloat16*'], ''' | |
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h" | |
extern "C" void kernel(const bfloat16* in_ptr0, | |
const float* in_ptr1, | |
float* out_ptr0, | |
float* out_ptr1, | |
float* out_ptr2, | |
bfloat16* out_ptr3) | |
{ | |
#pragma omp parallel num_threads(56) | |
{ | |
int tid = omp_get_thread_num(); | |
{ | |
#pragma omp for | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(768L); x0+=static_cast<long>(1L)) | |
{ | |
{ | |
float tmp_acc0 = -std::numeric_limits<float>::infinity(); | |
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(-std::numeric_limits<float>::infinity()); | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(832L); x1+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x1 + (832L*x0)), 16); | |
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(x1), 16); | |
auto tmp1 = at::vec::convert<float>(tmp0); | |
auto tmp2 = static_cast<float>(0.125); | |
auto tmp3 = at::vec::Vectorized<float>(tmp2); | |
auto tmp4 = tmp1 * tmp3; | |
auto tmp5 = (tmp4); | |
auto tmp7 = static_cast<float>(1.0); | |
auto tmp8 = at::vec::Vectorized<float>(tmp7); | |
auto tmp9 = tmp8 - tmp6; | |
auto tmp10 = static_cast<float>(-10000.0); | |
auto tmp11 = at::vec::Vectorized<float>(tmp10); | |
auto tmp12 = tmp9 * tmp11; | |
auto tmp13 = tmp5 + tmp12; | |
tmp_acc0_vec = at::vec::maximum(tmp_acc0_vec, tmp13); | |
} | |
tmp_acc0 = max_propagate_nan(tmp_acc0, at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return at::vec::maximum(x, y); }, tmp_acc0_vec)); | |
out_ptr0[static_cast<long>(x0)] = static_cast<float>(tmp_acc0); | |
} | |
{ | |
float tmp_acc0 = 0; | |
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0); | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(832L); x1+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x1 + (832L*x0)), 16); | |
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(x1), 16); | |
auto tmp14 = out_ptr0[static_cast<long>(x0)]; | |
auto tmp1 = at::vec::convert<float>(tmp0); | |
auto tmp2 = static_cast<float>(0.125); | |
auto tmp3 = at::vec::Vectorized<float>(tmp2); | |
auto tmp4 = tmp1 * tmp3; | |
auto tmp5 = (tmp4); | |
auto tmp7 = static_cast<float>(1.0); | |
auto tmp8 = at::vec::Vectorized<float>(tmp7); | |
auto tmp9 = tmp8 - tmp6; | |
auto tmp10 = static_cast<float>(-10000.0); | |
auto tmp11 = at::vec::Vectorized<float>(tmp10); | |
auto tmp12 = tmp9 * tmp11; | |
auto tmp13 = tmp5 + tmp12; | |
auto tmp15 = at::vec::Vectorized<float>(tmp14); | |
auto tmp16 = tmp13 - tmp15; | |
auto tmp17 = tmp16.exp(); | |
tmp17.store(out_ptr1 + static_cast<long>(x1 + (832L*x0))); | |
tmp_acc0_vec = tmp_acc0_vec + tmp17; | |
} | |
tmp_acc0 = tmp_acc0 + at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return x + y; }, tmp_acc0_vec); | |
out_ptr2[static_cast<long>(x0)] = static_cast<float>(tmp_acc0); | |
} | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(832L); x1+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr1 + static_cast<long>(x1 + (832L*x0)), 16); | |
auto tmp1 = out_ptr2[static_cast<long>(x0)]; | |
auto tmp2 = at::vec::Vectorized<float>(tmp1); | |
auto tmp3 = tmp0 / tmp2; | |
auto tmp4 = at::vec::convert<bfloat16>(tmp3); | |
tmp4.store(out_ptr3 + static_cast<long>(x1 + (832L*x0)), 16); | |
} | |
} | |
} | |
} | |
} | |
''') | |
cpp_fused_cat_mul_9 = async_compile.cpp_pybinding(['const bfloat16*', 'const bfloat16*', 'const bfloat16*', 'const bfloat16*', 'const bfloat16*', 'const bfloat16*', 'const bfloat16*', 'const bfloat16*', 'const bfloat16*', 'const float*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'float*'], ''' | |
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h" | |
extern "C" void kernel(const bfloat16* in_ptr0, | |
const bfloat16* in_ptr1, | |
const bfloat16* in_ptr2, | |
const bfloat16* in_ptr3, | |
const bfloat16* in_ptr4, | |
const bfloat16* in_ptr5, | |
const bfloat16* in_ptr6, | |
const bfloat16* in_ptr7, | |
const bfloat16* in_ptr8, | |
const float* in_ptr9, | |
bfloat16* out_ptr0, | |
bfloat16* out_ptr1, | |
bfloat16* out_ptr2, | |
bfloat16* out_ptr3, | |
bfloat16* out_ptr4, | |
float* out_ptr5) | |
{ | |
{ | |
#pragma GCC ivdep | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L)) | |
{ | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(4096L); x1+=static_cast<long>(32L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x1 + (4096L*x0)), 32); | |
tmp0.store(out_ptr0 + static_cast<long>(x1 + (53248L*x0)), 32); | |
} | |
} | |
} | |
{ | |
#pragma GCC ivdep | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L)) | |
{ | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(4096L); x1+=static_cast<long>(32L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr1 + static_cast<long>(x1 + (4096L*x0)), 32); | |
tmp0.store(out_ptr1 + static_cast<long>(x1 + (53248L*x0)), 32); | |
} | |
} | |
} | |
#pragma omp parallel num_threads(56) | |
{ | |
int tid = omp_get_thread_num(); | |
{ | |
#pragma omp for | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L)) | |
{ | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(36864L); x1+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr2 + static_cast<long>(x1 + (36864L*x0)), 16); | |
auto tmp2 = at::vec::Vectorized<bfloat16>::loadu(in_ptr3 + static_cast<long>(x1 + (36864L*x0)), 16); | |
auto tmp5 = at::vec::Vectorized<bfloat16>::loadu(in_ptr4 + static_cast<long>(x1 + (36864L*x0)), 16); | |
auto tmp8 = at::vec::Vectorized<bfloat16>::loadu(in_ptr5 + static_cast<long>(x1 + (36864L*x0)), 16); | |
auto tmp1 = at::vec::convert<float>(tmp0); | |
auto tmp3 = at::vec::convert<float>(tmp2); | |
auto tmp4 = tmp1 + tmp3; | |
auto tmp6 = at::vec::convert<float>(tmp5); | |
auto tmp7 = tmp4 + tmp6; | |
auto tmp9 = at::vec::convert<float>(tmp8); | |
auto tmp10 = tmp7 + tmp9; | |
auto tmp11 = at::vec::convert<bfloat16>(tmp10); | |
tmp11.store(out_ptr2 + static_cast<long>(x1 + (53248L*x0)), 16); | |
} | |
} | |
} | |
#pragma omp single | |
{ | |
{ | |
#pragma GCC ivdep | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L)) | |
{ | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(4096L); x1+=static_cast<long>(32L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr6 + static_cast<long>(x1 + (4096L*x0)), 32); | |
tmp0.store(out_ptr3 + static_cast<long>(x1 + (53248L*x0)), 32); | |
} | |
} | |
} | |
} | |
#pragma omp single | |
{ | |
{ | |
#pragma GCC ivdep | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L)) | |
{ | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(4096L); x1+=static_cast<long>(32L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr7 + static_cast<long>(x1 + (4096L*x0)), 32); | |
tmp0.store(out_ptr4 + static_cast<long>(x1 + (53248L*x0)), 32); | |
} | |
} | |
} | |
} | |
{ | |
#pragma omp for | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L)) | |
{ | |
#pragma GCC ivdep | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(832L); x1+=static_cast<long>(1L)) | |
{ | |
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr8 + static_cast<long>(x2 + (64L*x1) + (53248L*x0)), 16); | |
auto tmp2 = in_ptr9[static_cast<long>(x1)]; | |
auto tmp1 = at::vec::convert<float>(tmp0); | |
auto tmp3 = at::vec::Vectorized<float>(tmp2); | |
auto tmp4 = tmp1 * tmp3; | |
tmp4.store(out_ptr5 + static_cast<long>(x2 + (64L*x1) + (53248L*x0))); | |
} | |
} | |
} | |
} | |
} | |
} | |
''') | |
async_compile.wait(globals()) | |
del async_compile | |
def call(args): | |
arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, arg12_1, arg13_1, arg14_1, arg15_1, arg16_1, arg17_1, arg18_1 = args | |
args.clear() | |
assert_size_stride(arg0_1, (11, 3), (3, 1)) | |
assert_size_stride(arg1_1, (11, 3), (3, 1)) | |
assert_size_stride(arg2_1, (11, 3), (3, 1)) | |
assert_size_stride(arg3_1, (11, 3), (3, 1)) | |
assert_size_stride(arg4_1, (11, 3), (3, 1)) | |
assert_size_stride(arg5_1, (11, 3), (3, 1)) | |
assert_size_stride(arg6_1, (11, 3), (3, 1)) | |
assert_size_stride(arg7_1, (11, 3), (3, 1)) | |
assert_size_stride(arg8_1, (11, 3), (3, 1)) | |
assert_size_stride(arg9_1, (11, 3), (3, 1)) | |
assert_size_stride(arg10_1, (11, 3), (3, 1)) | |
assert_size_stride(arg11_1, (11, 3), (3, 1)) | |
assert_size_stride(arg12_1, (1, 12, 832, 64), (638976, 64, 768, 1)) | |
assert_size_stride(arg13_1, (1, 13, 64), (832, 64, 1)) | |
assert_size_stride(arg14_1, (1, 12, 832, 64), (638976, 64, 768, 1)) | |
assert_size_stride(arg15_1, (1, 12, 832, 64), (638976, 64, 768, 1)) | |
assert_size_stride(arg16_1, (1, 1, 1, 832), (832, 832, 832, 1)) | |
assert_size_stride(arg17_1, (1, 1, 9, 64, 192), (110592, 110592, 12288, 192, 1)) | |
assert_size_stride(arg18_1, (1, 1, 832, 1), (832, 832, 1, 1)) | |
buf0 = empty_strided_cpu((12, 64, 832), (53248, 832, 1), torch.bfloat16) | |
# Source Nodes: [bmm], Original ATen: [aten.bmm] | |
extern_kernels.bmm(reinterpret_tensor(arg12_1, (12, 64, 64), (64, 768, 1), 0), reinterpret_tensor(arg14_1, (12, 64, 832), (64, 1, 768), 0), out=buf0) | |
buf1 = empty_strided_cpu((1, 12, 64, 1), (768, 64, 1, 768), torch.float32) | |
buf2 = empty_strided_cpu((1, 12, 64, 832), (638976, 53248, 832, 1), torch.float32) | |
buf3 = empty_strided_cpu((1, 12, 64, 1), (768, 64, 1, 768), torch.float32) | |
buf4 = empty_strided_cpu((1, 12, 64, 832), (638976, 53248, 832, 1), torch.bfloat16) | |
cpp_fused__softmax_add_mul_rsub_0(buf0, arg16_1, buf1, buf2, buf3, buf4) | |
buf5 = empty_strided_cpu((12, 64, 64), (4096, 64, 1), torch.bfloat16) | |
# Source Nodes: [bmm_1], Original ATen: [aten.bmm] | |
extern_kernels.bmm(reinterpret_tensor(buf4, (12, 64, 832), (53248, 832, 1), 0), reinterpret_tensor(arg15_1, (12, 832, 64), (64, 768, 1), 0), out=buf5) | |
buf18 = empty_strided_cpu((132, 3), (3, 1), torch.int32) | |
buf6 = reinterpret_tensor(buf18, (11, 3), (3, 1), 0) # alias | |
buf7 = reinterpret_tensor(buf18, (11, 3), (3, 1), 33) # alias | |
buf8 = reinterpret_tensor(buf18, (11, 3), (3, 1), 66) # alias | |
buf9 = reinterpret_tensor(buf18, (11, 3), (3, 1), 99) # alias | |
buf10 = reinterpret_tensor(buf18, (11, 3), (3, 1), 132) # alias | |
buf11 = reinterpret_tensor(buf18, (11, 3), (3, 1), 165) # alias | |
buf12 = reinterpret_tensor(buf18, (11, 3), (3, 1), 198) # alias | |
buf13 = reinterpret_tensor(buf18, (11, 3), (3, 1), 231) # alias | |
buf14 = reinterpret_tensor(buf18, (11, 3), (3, 1), 264) # alias | |
buf15 = reinterpret_tensor(buf18, (11, 3), (3, 1), 297) # alias | |
buf16 = reinterpret_tensor(buf18, (11, 3), (3, 1), 330) # alias | |
buf17 = reinterpret_tensor(buf18, (11, 3), (3, 1), 363) # alias | |
buf19 = empty_strided_cpu((12, 11, 3), (33, 3, 1), torch.int64) | |
buf25 = empty_strided_cpu((1, 12, 448, 64), (344064, 28672, 64, 1), torch.bfloat16) | |
buf20 = reinterpret_tensor(buf25, (1, 12, 64, 64), (344064, 28672, 64, 1), 0) # alias | |
buf78 = empty_strided_cpu((1, 12, 448, 64), (344064, 28672, 64, 1), torch.bfloat16) | |
buf73 = reinterpret_tensor(buf78, (1, 12, 64, 64), (344064, 28672, 64, 1), 0) # alias | |
buf21 = reinterpret_tensor(buf25, (1, 12, 64, 64), (344064, 28672, 64, 1), 4096) # alias | |
buf22 = reinterpret_tensor(buf25, (1, 12, 64, 64), (344064, 28672, 64, 1), 8192) # alias | |
buf23 = reinterpret_tensor(buf25, (1, 12, 64, 64), (344064, 28672, 64, 1), 12288) # alias | |
buf76 = reinterpret_tensor(buf78, (1, 12, 64, 64), (344064, 28672, 64, 1), 12288) # alias | |
buf24 = reinterpret_tensor(buf25, (1, 12, 192, 64), (344064, 28672, 64, 1), 16384) # alias | |
buf77 = reinterpret_tensor(buf78, (1, 12, 192, 64), (344064, 28672, 64, 1), 16384) # alias | |
cpp_fused__to_copy_cat_stack_1(arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, buf18, arg14_1, buf6, buf7, buf8, buf9, buf10, buf11, buf12, buf13, buf14, buf15, buf16, buf17, buf19, buf20, buf73, buf21, buf22, buf23, buf76, buf24, buf77) | |
del arg0_1 | |
del arg10_1 | |
del arg11_1 | |
del arg1_1 | |
del arg2_1 | |
del arg3_1 | |
del arg4_1 | |
del arg5_1 | |
del arg6_1 | |
del arg7_1 | |
del arg8_1 | |
del arg9_1 | |
del buf10 | |
del buf11 | |
del buf12 | |
del buf13 | |
del buf14 | |
del buf15 | |
del buf16 | |
del buf17 | |
del buf18 | |
del buf20 | |
del buf21 | |
del buf22 | |
del buf23 | |
del buf24 | |
del buf6 | |
del buf7 | |
del buf8 | |
del buf9 | |
buf26 = empty_strided_cpu((12, 64, 448), (28672, 448, 1), torch.bfloat16) | |
# Source Nodes: [bmm_2], Original ATen: [aten.bmm] | |
extern_kernels.bmm(reinterpret_tensor(arg12_1, (12, 64, 64), (64, 768, 1), 49152), reinterpret_tensor(buf25, (12, 64, 448), (28672, 1, 64), 0), out=buf26) | |
buf30 = empty_strided_cpu((1, 1, 1, 448), (448, 448, 448, 1), torch.float32) | |
buf27 = reinterpret_tensor(buf30, (1, 1, 1, 192), (448, 448, 448, 1), 0) # alias | |
buf28 = reinterpret_tensor(buf30, (1, 1, 1, 64), (448, 448, 448, 1), 192) # alias | |
buf29 = reinterpret_tensor(buf30, (1, 1, 1, 192), (448, 448, 448, 1), 256) # alias | |
buf33 = empty_strided_cpu((1, 12, 64, 448), (344064, 28672, 448, 1), torch.float32) | |
buf31 = reinterpret_tensor(buf33, (1, 12, 64, 256), (344064, 28672, 448, 1), 0) # alias | |
buf32 = reinterpret_tensor(buf33, (1, 12, 64, 192), (344064, 28672, 448, 1), 256) # alias | |
buf86 = empty_strided_cpu((1, 12, 64, 448), (344064, 28672, 448, 1), torch.float32) | |
buf85 = reinterpret_tensor(buf86, (1, 12, 64, 192), (344064, 28672, 448, 1), 256) # alias | |
buf34 = buf3; del buf3 # reuse | |
buf35 = empty_strided_cpu((1, 12, 64, 448), (344064, 28672, 448, 1), torch.float32) | |
buf36 = buf1; del buf1 # reuse | |
buf43 = reinterpret_tensor(buf25, (1, 12, 64, 448), (344064, 28672, 448, 1), 0); del buf25 # reuse | |
buf42 = empty_strided_cpu((1, 12, 448, 64), (344064, 28672, 64, 1), torch.bfloat16) | |
buf37 = reinterpret_tensor(buf42, (1, 12, 64, 64), (344064, 28672, 64, 1), 0) # alias | |
buf95 = empty_strided_cpu((1, 12, 448, 64), (344064, 28672, 64, 1), torch.bfloat16) | |
buf90 = reinterpret_tensor(buf95, (1, 12, 64, 64), (344064, 28672, 64, 1), 0) # alias | |
buf38 = reinterpret_tensor(buf42, (1, 12, 64, 64), (344064, 28672, 64, 1), 4096) # alias | |
buf39 = reinterpret_tensor(buf42, (1, 12, 64, 64), (344064, 28672, 64, 1), 8192) # alias | |
buf40 = reinterpret_tensor(buf42, (1, 12, 64, 64), (344064, 28672, 64, 1), 12288) # alias | |
buf93 = reinterpret_tensor(buf95, (1, 12, 64, 64), (344064, 28672, 64, 1), 12288) # alias | |
buf41 = reinterpret_tensor(buf42, (1, 12, 192, 64), (344064, 28672, 64, 1), 16384) # alias | |
buf94 = reinterpret_tensor(buf95, (1, 12, 192, 64), (344064, 28672, 64, 1), 16384) # alias | |
cpp_fused__softmax_add_cat_minimum_mul_new_ones_rsub_2(arg16_1, arg13_1, buf19, buf26, buf30, buf33, arg15_1, buf27, buf28, buf29, buf31, buf32, buf85, buf34, buf35, buf36, buf43, buf37, buf90, buf38, buf39, buf40, buf93, buf41, buf94) | |
del buf26 | |
del buf27 | |
del buf28 | |
del buf29 | |
del buf31 | |
del buf32 | |
del buf33 | |
del buf37 | |
del buf38 | |
del buf39 | |
del buf40 | |
del buf41 | |
buf44 = empty_strided_cpu((12, 64, 64), (4096, 64, 1), torch.bfloat16) | |
# Source Nodes: [bmm_3], Original ATen: [aten.bmm] | |
extern_kernels.bmm(reinterpret_tensor(buf43, (12, 64, 448), (28672, 448, 1), 0), reinterpret_tensor(buf42, (12, 448, 64), (28672, 64, 1), 0), out=buf44) | |
del buf42 | |
buf45 = empty_strided_cpu((12, 576, 64), (36864, 64, 1), torch.bfloat16) | |
# Source Nodes: [first_band_product], Original ATen: [aten.bmm] | |
extern_kernels.bmm(reinterpret_tensor(arg12_1, (12, 576, 64), (64, 768, 1), 98304), reinterpret_tensor(arg14_1, (12, 64, 64), (64, 1, 768), 0), out=buf45) | |
buf49 = empty_strided_cpu((1, 12, 9, 192, 64), (1327104, 110592, 12288, 64, 1), torch.bfloat16) | |
buf46 = reinterpret_tensor(buf49, (1, 12, 9, 64, 64), (1327104, 110592, 12288, 64, 1), 0) # alias | |
buf47 = reinterpret_tensor(buf49, (1, 12, 9, 64, 64), (1327104, 110592, 12288, 64, 1), 4096) # alias | |
buf48 = reinterpret_tensor(buf49, (1, 12, 9, 64, 64), (1327104, 110592, 12288, 64, 1), 8192) # alias | |
buf50 = empty_strided_cpu((1, 12, 9, 64, 64), (442368, 36864, 4096, 64, 1), torch.bfloat16) | |
cpp_fused_cat_clone_3(arg14_1, arg12_1, buf46, buf47, buf48, buf50) | |
del buf46 | |
del buf47 | |
del buf48 | |
buf51 = empty_strided_cpu((108, 64, 192), (12288, 192, 1), torch.bfloat16) | |
# Source Nodes: [bmm_4], Original ATen: [aten.bmm] | |
extern_kernels.bmm(reinterpret_tensor(buf50, (108, 64, 64), (4096, 64, 1), 0), reinterpret_tensor(buf49, (108, 64, 192), (12288, 1, 64), 0), out=buf51) | |
buf52 = buf49; del buf49 # reuse | |
buf69 = empty_strided_cpu((1, 12, 9, 192, 64), (1327104, 110592, 12288, 64, 1), torch.bfloat16) | |
cpp_fused_clone_4(buf19, arg14_1, arg15_1, buf52, buf69) | |
buf53 = empty_strided_cpu((108, 64, 192), (12288, 192, 1), torch.bfloat16) | |
# Source Nodes: [bmm_5], Original ATen: [aten.bmm] | |
extern_kernels.bmm(reinterpret_tensor(buf50, (108, 64, 64), (4096, 64, 1), 0), reinterpret_tensor(buf52, (108, 64, 192), (12288, 1, 64), 0), out=buf53) | |
buf54 = reinterpret_tensor(buf50, (12, 576, 64), (36864, 64, 1), 0); del buf50 # reuse | |
# Source Nodes: [last_band_product], Original ATen: [aten.bmm] | |
extern_kernels.bmm(reinterpret_tensor(arg12_1, (12, 576, 64), (64, 768, 1), 98304), reinterpret_tensor(arg14_1, (12, 64, 64), (64, 1, 768), 589824), out=buf54) | |
buf59 = empty_strided_cpu((1, 12, 9, 64, 512), (3538944, 294912, 32768, 512, 1), torch.bfloat16) | |
buf55 = reinterpret_tensor(buf59, (1, 12, 9, 64, 64), (3538944, 294912, 32768, 512, 1), 0) # alias | |
buf56 = reinterpret_tensor(buf59, (1, 12, 9, 64, 192), (3538944, 294912, 32768, 512, 1), 64) # alias | |
buf57 = reinterpret_tensor(buf59, (1, 12, 9, 64, 192), (3538944, 294912, 32768, 512, 1), 256) # alias | |
buf58 = reinterpret_tensor(buf59, (1, 12, 9, 64, 64), (3538944, 294912, 32768, 512, 1), 448) # alias | |
buf60 = empty_strided_cpu((1, 12, 9, 64, 1), (6912, 576, 64, 1, 6912), torch.float32) | |
buf61 = empty_strided_cpu((1, 12, 9, 64, 512), (3538944, 294912, 32768, 512, 1), torch.float32) | |
buf62 = empty_strided_cpu((1, 12, 9, 64, 1), (6912, 576, 64, 1, 6912), torch.float32) | |
buf67 = empty_strided_cpu((1, 12, 9, 64, 512), (3538944, 294912, 32768, 512, 1), torch.bfloat16) | |
buf66 = buf52; del buf52 # reuse | |
buf63 = reinterpret_tensor(buf66, (1, 12, 9, 64, 64), (1327104, 110592, 12288, 64, 1), 0) # alias | |
buf64 = reinterpret_tensor(buf66, (1, 12, 9, 64, 64), (1327104, 110592, 12288, 64, 1), 4096) # alias | |
buf65 = reinterpret_tensor(buf66, (1, 12, 9, 64, 64), (1327104, 110592, 12288, 64, 1), 8192) # alias | |
cpp_fused__softmax__to_copy_add_cat_mul_rsub_5(buf45, arg16_1, buf51, arg17_1, buf53, arg13_1, buf19, buf54, buf59, arg15_1, buf55, buf56, buf57, buf58, buf60, buf61, buf62, buf67, buf63, buf64, buf65) | |
del arg13_1 | |
del arg17_1 | |
del buf51 | |
del buf53 | |
del buf55 | |
del buf56 | |
del buf57 | |
del buf58 | |
del buf59 | |
del buf60 | |
del buf61 | |
del buf62 | |
del buf63 | |
del buf64 | |
del buf65 | |
buf68 = reinterpret_tensor(buf54, (108, 64, 64), (4096, 64, 1), 0); del buf54 # reuse | |
# Source Nodes: [bmm_6], Original ATen: [aten.bmm] | |
extern_kernels.bmm(reinterpret_tensor(buf67, (108, 64, 192), (32768, 512, 1), 64), reinterpret_tensor(buf66, (108, 192, 64), (12288, 64, 1), 0), out=buf68) | |
del buf66 | |
buf70 = reinterpret_tensor(buf45, (108, 64, 64), (4096, 64, 1), 0); del buf45 # reuse | |
# Source Nodes: [bmm_7], Original ATen: [aten.bmm] | |
extern_kernels.bmm(reinterpret_tensor(buf67, (108, 64, 192), (32768, 512, 1), 256), reinterpret_tensor(buf69, (108, 192, 64), (12288, 64, 1), 0), out=buf70) | |
del buf69 | |
buf71 = empty_strided_cpu((12, 576, 64), (36864, 64, 1), torch.bfloat16) | |
# Source Nodes: [einsum_3], Original ATen: [aten.bmm] | |
extern_kernels.bmm(reinterpret_tensor(buf67, (12, 576, 64), (294912, 512, 1), 0), reinterpret_tensor(arg15_1, (12, 64, 64), (64, 768, 1), 0), out=buf71) | |
buf72 = empty_strided_cpu((12, 576, 64), (36864, 64, 1), torch.bfloat16) | |
# Source Nodes: [einsum_4], Original ATen: [aten.bmm] | |
extern_kernels.bmm(reinterpret_tensor(buf67, (12, 576, 64), (294912, 512, 1), 448), reinterpret_tensor(arg15_1, (12, 64, 64), (64, 768, 1), 589824), out=buf72) | |
del buf67 | |
buf74 = reinterpret_tensor(buf78, (1, 12, 64, 64), (344064, 28672, 64, 1), 4096) # alias | |
buf75 = reinterpret_tensor(buf78, (1, 12, 64, 64), (344064, 28672, 64, 1), 8192) # alias | |
cpp_fused_cat_6(arg14_1, buf74, buf75) | |
del buf73 | |
del buf74 | |
del buf75 | |
del buf76 | |
del buf77 | |
buf79 = reinterpret_tensor(buf43, (12, 64, 448), (28672, 448, 1), 0); del buf43 # reuse | |
# Source Nodes: [bmm_8], Original ATen: [aten.bmm] | |
extern_kernels.bmm(reinterpret_tensor(arg12_1, (12, 64, 64), (64, 768, 1), 540672), reinterpret_tensor(buf78, (12, 64, 448), (28672, 1, 64), 0), out=buf79) | |
buf83 = buf30; del buf30 # reuse | |
buf80 = reinterpret_tensor(buf83, (1, 1, 1, 64), (448, 448, 448, 1), 0) # alias | |
buf81 = reinterpret_tensor(buf83, (1, 1, 1, 192), (448, 448, 448, 1), 64) # alias | |
buf82 = reinterpret_tensor(buf83, (1, 1, 1, 192), (448, 448, 448, 1), 256) # alias | |
buf84 = reinterpret_tensor(buf86, (1, 12, 64, 256), (344064, 28672, 448, 1), 0) # alias | |
buf87 = buf36; del buf36 # reuse | |
buf88 = buf35; del buf35 # reuse | |
buf89 = buf34; del buf34 # reuse | |
buf96 = reinterpret_tensor(buf78, (1, 12, 64, 448), (344064, 28672, 448, 1), 0); del buf78 # reuse | |
buf91 = reinterpret_tensor(buf95, (1, 12, 64, 64), (344064, 28672, 64, 1), 4096) # alias | |
buf92 = reinterpret_tensor(buf95, (1, 12, 64, 64), (344064, 28672, 64, 1), 8192) # alias | |
cpp_fused__softmax_add_cat_minimum_mul_rsub_7(arg16_1, buf79, buf83, buf86, arg15_1, buf80, buf81, buf82, buf84, buf87, buf88, buf89, buf96, buf91, buf92) | |
del buf79 | |
del buf80 | |
del buf81 | |
del buf82 | |
del buf83 | |
del buf84 | |
del buf85 | |
del buf86 | |
del buf88 | |
del buf90 | |
del buf91 | |
del buf92 | |
del buf93 | |
del buf94 | |
buf97 = empty_strided_cpu((12, 64, 64), (4096, 64, 1), torch.bfloat16) | |
# Source Nodes: [bmm_9], Original ATen: [aten.bmm] | |
extern_kernels.bmm(reinterpret_tensor(buf96, (12, 64, 448), (28672, 448, 1), 0), reinterpret_tensor(buf95, (12, 448, 64), (28672, 64, 1), 0), out=buf97) | |
del buf95 | |
del buf96 | |
buf98 = reinterpret_tensor(buf4, (12, 64, 832), (53248, 832, 1), 0); del buf4 # reuse | |
# Source Nodes: [bmm_10], Original ATen: [aten.bmm] | |
extern_kernels.bmm(reinterpret_tensor(arg12_1, (12, 64, 64), (64, 768, 1), 589824), reinterpret_tensor(arg14_1, (12, 64, 832), (64, 1, 768), 0), out=buf98) | |
del arg12_1 | |
del arg14_1 | |
buf99 = buf89; del buf89 # reuse | |
buf100 = buf2; del buf2 # reuse | |
buf101 = buf87; del buf87 # reuse | |
buf102 = reinterpret_tensor(buf0, (1, 12, 64, 832), (638976, 53248, 832, 1), 0); del buf0 # reuse | |
cpp_fused__softmax_add_mul_rsub_8(buf98, arg16_1, buf99, buf100, buf101, buf102) | |
del arg16_1 | |
del buf101 | |
del buf98 | |
del buf99 | |
buf103 = empty_strided_cpu((12, 64, 64), (4096, 64, 1), torch.bfloat16) | |
# Source Nodes: [bmm_11], Original ATen: [aten.bmm] | |
extern_kernels.bmm(reinterpret_tensor(buf102, (12, 64, 832), (53248, 832, 1), 0), reinterpret_tensor(arg15_1, (12, 832, 64), (64, 768, 1), 0), out=buf103) | |
del arg15_1 | |
buf109 = reinterpret_tensor(buf102, (1, 12, 13, 64, 64), (638976, 53248, 4096, 64, 1), 0); del buf102 # reuse | |
buf104 = reinterpret_tensor(buf109, (1, 12, 1, 64, 64), (638976, 53248, 4096, 64, 1), 0) # alias | |
buf105 = reinterpret_tensor(buf109, (1, 12, 1, 64, 64), (638976, 53248, 4096, 64, 1), 4096) # alias | |
buf106 = reinterpret_tensor(buf109, (1, 12, 9, 64, 64), (638976, 53248, 4096, 64, 1), 8192) # alias | |
buf107 = reinterpret_tensor(buf109, (1, 12, 1, 64, 64), (638976, 53248, 4096, 64, 1), 45056) # alias | |
buf108 = reinterpret_tensor(buf109, (1, 12, 1, 64, 64), (638976, 53248, 4096, 64, 1), 49152) # alias | |
buf110 = reinterpret_tensor(buf100, (1, 12, 832, 64), (638976, 53248, 64, 1), 0); del buf100 # reuse | |
cpp_fused_cat_mul_9(buf5, buf44, buf68, buf70, buf71, buf72, buf97, buf103, buf109, arg18_1, buf104, buf105, buf106, buf107, buf108, buf110) | |
del arg18_1 | |
return (reinterpret_tensor(buf110, (1, 832, 12, 64), (638976, 64, 53248, 1), 0), reinterpret_tensor(buf19, (1, 12, 11, 3), (396, 33, 3, 1), 0), ) | |
def benchmark_compiled_module(times=10, repeat=10):
    """Time the compiled `call` graph on freshly allocated random inputs.

    Builds the 19 input tensors with the exact shapes/strides/dtypes the
    compiled module was traced with, then reports timing via
    `print_performance`.
    """
    from torch._dynamo.testing import rand_strided
    from torch._inductor.utils import print_performance

    # arg0_1 .. arg11_1: twelve identical (11, 3) int32 index tensors
    # (the BigBird random-attention block indices).
    index_args = [
        rand_strided((11, 3), (3, 1), device='cpu', dtype=torch.int32)
        for _ in range(12)
    ]

    # arg12_1 / arg14_1 / arg15_1: query/key/value layers (bfloat16, padded
    # row stride 768); the rest are float32 masks of various layouts.
    arg12_1 = rand_strided((1, 12, 832, 64), (638976, 64, 768, 1), device='cpu', dtype=torch.bfloat16)
    arg13_1 = rand_strided((1, 13, 64), (832, 64, 1), device='cpu', dtype=torch.float32)
    arg14_1 = rand_strided((1, 12, 832, 64), (638976, 64, 768, 1), device='cpu', dtype=torch.bfloat16)
    arg15_1 = rand_strided((1, 12, 832, 64), (638976, 64, 768, 1), device='cpu', dtype=torch.bfloat16)
    arg16_1 = rand_strided((1, 1, 1, 832), (832, 832, 832, 1), device='cpu', dtype=torch.float32)
    arg17_1 = rand_strided((1, 1, 9, 64, 192), (110592, 110592, 12288, 192, 1), device='cpu', dtype=torch.float32)
    arg18_1 = rand_strided((1, 1, 832, 1), (832, 832, 1, 1), device='cpu', dtype=torch.float32)
    tensor_args = [arg12_1, arg13_1, arg14_1, arg15_1, arg16_1, arg17_1, arg18_1]

    fn = lambda: call(index_args + tensor_args)
    return print_performance(fn, times=times, repeat=repeat)
if __name__ == "__main__":
    # Standalone entry point: hand the benchmark callable to Inductor's
    # wrapper-benchmark harness under the model name it was traced from.
    from torch._inductor.wrapper_benchmark import compiled_module_main

    compiled_module_main('hf_BigBird', benchmark_compiled_module)
V0627 17:31:09.567000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"} | |
[ | |
] | |
V0627 17:31:09.568000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0, "has_payload": "81a28a443bd0d99705f0b5d2b9a46edc"} | |
TREE_GUARD_MANAGER: | |
+- RootGuardManager | |
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards | |
| +- GLOBAL_STATE: ___check_global_state() | |
| +- GuardManager: source=L['bsz'], accessed_by=DictGetItemGuardAccessor(bsz) | |
| | +- EQUALS_MATCH: L['bsz'] == 1 | |
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self) | |
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202274384) | |
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824) | |
| +- GuardManager: source=L['n_heads'], accessed_by=DictGetItemGuardAccessor(n_heads) | |
| | +- EQUALS_MATCH: L['n_heads'] == 12 | |
| +- GuardManager: source=L['rsqrt_d'], accessed_by=DictGetItemGuardAccessor(rsqrt_d) | |
| | +- EQUALS_MATCH: L['rsqrt_d'] == 0.125 | |
| +- GuardManager: source=L['to_mask'], accessed_by=DictGetItemGuardAccessor(to_mask) | |
| | +- TENSOR_MATCH: check_tensor(L['to_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 1, 832], stride=[832, 832, 832, 1]) | |
| | +- NO_HASATTR: hasattr(L['to_mask'], '_dynamo_dynamic_indices') == False | |
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11])) | |
| +- GuardManager: source=L['___stack0'], accessed_by=DictGetItemGuardAccessor(___stack0) | |
| | +- TYPE_MATCH: ___check_type_id(L['___stack0'], 7650400) | |
| | +- LENGTH_CHECK: len(L['___stack0']) == 12 | |
| | +- GuardManager: source=L['___stack0'][0], accessed_by=ListGetItemGuardAccessor(0) | |
| | | +- GuardManager: source=___from_numpy(L['___stack0'][0]), accessed_by=PythonLambdaGuardAccessor | |
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][0]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1]) | |
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][0]), '_dynamo_dynamic_indices') == False | |
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11])) | |
| | +- GuardManager: source=L['___stack0'][1], accessed_by=ListGetItemGuardAccessor(1) | |
| | | +- GuardManager: source=___from_numpy(L['___stack0'][1]), accessed_by=PythonLambdaGuardAccessor | |
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][1]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1]) | |
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][1]), '_dynamo_dynamic_indices') == False | |
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11])) | |
| | +- GuardManager: source=L['___stack0'][2], accessed_by=ListGetItemGuardAccessor(2) | |
| | | +- GuardManager: source=___from_numpy(L['___stack0'][2]), accessed_by=PythonLambdaGuardAccessor | |
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][2]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1]) | |
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][2]), '_dynamo_dynamic_indices') == False | |
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11])) | |
| | +- GuardManager: source=L['___stack0'][3], accessed_by=ListGetItemGuardAccessor(3) | |
| | | +- GuardManager: source=___from_numpy(L['___stack0'][3]), accessed_by=PythonLambdaGuardAccessor | |
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][3]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1]) | |
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][3]), '_dynamo_dynamic_indices') == False | |
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11])) | |
| | +- GuardManager: source=L['___stack0'][4], accessed_by=ListGetItemGuardAccessor(4) | |
| | | +- GuardManager: source=___from_numpy(L['___stack0'][4]), accessed_by=PythonLambdaGuardAccessor | |
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][4]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1]) | |
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][4]), '_dynamo_dynamic_indices') == False | |
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11])) | |
| | +- GuardManager: source=L['___stack0'][5], accessed_by=ListGetItemGuardAccessor(5) | |
| | | +- GuardManager: source=___from_numpy(L['___stack0'][5]), accessed_by=PythonLambdaGuardAccessor | |
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][5]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1]) | |
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][5]), '_dynamo_dynamic_indices') == False | |
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11])) | |
| | +- GuardManager: source=L['___stack0'][6], accessed_by=ListGetItemGuardAccessor(6) | |
| | | +- GuardManager: source=___from_numpy(L['___stack0'][6]), accessed_by=PythonLambdaGuardAccessor | |
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][6]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1]) | |
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][6]), '_dynamo_dynamic_indices') == False | |
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11])) | |
| | +- GuardManager: source=L['___stack0'][7], accessed_by=ListGetItemGuardAccessor(7) | |
| | | +- GuardManager: source=___from_numpy(L['___stack0'][7]), accessed_by=PythonLambdaGuardAccessor | |
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][7]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1]) | |
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][7]), '_dynamo_dynamic_indices') == False | |
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11])) | |
| | +- GuardManager: source=L['___stack0'][8], accessed_by=ListGetItemGuardAccessor(8) | |
| | | +- GuardManager: source=___from_numpy(L['___stack0'][8]), accessed_by=PythonLambdaGuardAccessor | |
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][8]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1]) | |
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][8]), '_dynamo_dynamic_indices') == False | |
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11])) | |
| | +- GuardManager: source=L['___stack0'][9], accessed_by=ListGetItemGuardAccessor(9) | |
| | | +- GuardManager: source=___from_numpy(L['___stack0'][9]), accessed_by=PythonLambdaGuardAccessor | |
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][9]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1]) | |
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][9]), '_dynamo_dynamic_indices') == False | |
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11])) | |
| | +- GuardManager: source=L['___stack0'][10], accessed_by=ListGetItemGuardAccessor(10) | |
| | | +- GuardManager: source=___from_numpy(L['___stack0'][10]), accessed_by=PythonLambdaGuardAccessor | |
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][10]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1]) | |
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][10]), '_dynamo_dynamic_indices') == False | |
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11])) | |
| | +- GuardManager: source=L['___stack0'][11], accessed_by=ListGetItemGuardAccessor(11) | |
| | | +- GuardManager: source=___from_numpy(L['___stack0'][11]), accessed_by=PythonLambdaGuardAccessor | |
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][11]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1]) | |
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][11]), '_dynamo_dynamic_indices') == False | |
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11])) | |
| +- GuardManager: source=L['band_mask'], accessed_by=DictGetItemGuardAccessor(band_mask) | |
| | +- TENSOR_MATCH: check_tensor(L['band_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 9, 64, 192], stride=[110592, 110592, 12288, 192, 1]) | |
| | +- NO_HASATTR: hasattr(L['band_mask'], '_dynamo_dynamic_indices') == False | |
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11])) | |
| +- GuardManager: source=L['from_mask'], accessed_by=DictGetItemGuardAccessor(from_mask) | |
| | +- TENSOR_MATCH: check_tensor(L['from_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 832, 1], stride=[832, 832, 1, 1]) | |
| | +- NO_HASATTR: hasattr(L['from_mask'], '_dynamo_dynamic_indices') == False | |
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11])) | |
| +- GuardManager: source=L['key_layer'], accessed_by=DictGetItemGuardAccessor(key_layer) | |
| | +- TENSOR_MATCH: check_tensor(L['key_layer'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.bfloat16, device=None, requires_grad=False, size=[1, 12, 832, 64], stride=[638976, 64, 768, 1]) | |
| | +- NO_HASATTR: hasattr(L['key_layer'], '_dynamo_dynamic_indices') == False | |
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11])) | |
| +- GuardManager: source=L['batch_size'], accessed_by=DictGetItemGuardAccessor(batch_size) | |
| | +- EQUALS_MATCH: L['batch_size'] == 1 | |
| +- GuardManager: source=L['to_seq_len'], accessed_by=DictGetItemGuardAccessor(to_seq_len) | |
| | +- EQUALS_MATCH: L['to_seq_len'] == 832 | |
| +- GuardManager: source=L['query_layer'], accessed_by=DictGetItemGuardAccessor(query_layer) | |
| | +- TENSOR_MATCH: check_tensor(L['query_layer'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.bfloat16, device=None, requires_grad=False, size=[1, 12, 832, 64], stride=[638976, 64, 768, 1]) | |
| | +- NO_HASATTR: hasattr(L['query_layer'], '_dynamo_dynamic_indices') == False | |
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11])) | |
| +- GuardManager: source=L['value_layer'], accessed_by=DictGetItemGuardAccessor(value_layer) | |
| | +- TENSOR_MATCH: check_tensor(L['value_layer'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.bfloat16, device=None, requires_grad=False, size=[1, 12, 832, 64], stride=[638976, 64, 768, 1]) | |
| | +- NO_HASATTR: hasattr(L['value_layer'], '_dynamo_dynamic_indices') == False | |
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11])) | |
| +- GuardManager: source=L['from_seq_len'], accessed_by=DictGetItemGuardAccessor(from_seq_len) | |
| | +- EQUALS_MATCH: L['from_seq_len'] == 832 | |
| +- GuardManager: source=L['n_rand_blocks'], accessed_by=DictGetItemGuardAccessor(n_rand_blocks) | |
| | +- EQUALS_MATCH: L['n_rand_blocks'] == 3 | |
| +- GuardManager: source=L['to_block_size'], accessed_by=DictGetItemGuardAccessor(to_block_size) | |
| | +- EQUALS_MATCH: L['to_block_size'] == 64 | |
| +- GuardManager: source=L['from_block_size'], accessed_by=DictGetItemGuardAccessor(from_block_size) | |
| | +- EQUALS_MATCH: L['from_block_size'] == 64 | |
| +- GuardManager: source=L['attn_mask_penalty'], accessed_by=DictGetItemGuardAccessor(attn_mask_penalty) | |
| | +- EQUALS_MATCH: L['attn_mask_penalty'] == -10000.0 | |
| +- GuardManager: source=L['from_blocked_mask'], accessed_by=DictGetItemGuardAccessor(from_blocked_mask) | |
| | +- TENSOR_MATCH: check_tensor(L['from_blocked_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 13, 64], stride=[832, 64, 1]) | |
| | +- NO_HASATTR: hasattr(L['from_blocked_mask'], '_dynamo_dynamic_indices') == False | |
| | +- TENSOR_ALIASING: L['from_blocked_mask'] is L['to_blocked_mask'] | |
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11])) | |
| +- GuardManager: source=L['output_attentions'], accessed_by=DictGetItemGuardAccessor(output_attentions) | |
| | +- ID_MATCH: ___check_obj_id(L['output_attentions'], 7685824) | |
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor | |
| | +- GuardManager: source=G['nn'], accessed_by=DictGetItemGuardAccessor(nn) | |
| | | +- ID_MATCH: ___check_obj_id(G['nn'], 139842442593680) | |
| | | +- GuardManager: source=G['nn'].functional, accessed_by=GetAttrGuardAccessor(functional) | |
| | | | +- ID_MATCH: ___check_obj_id(G['nn'].functional, 139842441627024) | |
| | | | +- GuardManager: source=G['nn'].functional.softmax, accessed_by=GetAttrGuardAccessor(softmax) | |
| | | | | +- ID_MATCH: ___check_obj_id(G['nn'].functional.softmax, 139842422997488) | |
| | +- GuardManager: source=G['np'], accessed_by=DictGetItemGuardAccessor(np) | |
| | | +- ID_MATCH: ___check_obj_id(G['np'], 139845228893488) | |
| | | +- GuardManager: source=G['np'].stack, accessed_by=GetAttrGuardAccessor(stack) | |
| | | | +- ID_MATCH: ___check_obj_id(G['np'].stack, 139844763318256) | |
| | +- GuardManager: source=G['torch'], accessed_by=DictGetItemGuardAccessor(torch) | |
| | | +- ID_MATCH: ___check_obj_id(G['torch'], 139845236322800) | |
| | | +- GuardManager: source=G['torch'].bmm, accessed_by=GetAttrGuardAccessor(bmm) | |
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].bmm, 139845228834192) | |
| | | +- GuardManager: source=G['torch'].cat, accessed_by=GetAttrGuardAccessor(cat) | |
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].cat, 139845228834672) | |
| | | +- GuardManager: source=G['torch'].div, accessed_by=GetAttrGuardAccessor(div) | |
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].div, 139845228790304) | |
| | | +- GuardManager: source=G['torch'].long, accessed_by=GetAttrGuardAccessor(long) | |
| | | | +- EQUALS_MATCH: G['torch'].long == torch.int64 | |
| | | +- GuardManager: source=G['torch'].stack, accessed_by=GetAttrGuardAccessor(stack) | |
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].stack, 139845228799024) | |
| | | +- GuardManager: source=G['torch'].arange, accessed_by=GetAttrGuardAccessor(arange) | |
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].arange, 139845228706960) | |
| | | +- GuardManager: source=G['torch'].einsum, accessed_by=GetAttrGuardAccessor(einsum) | |
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].einsum, 139842415911568) | |
| | | +- GuardManager: source=G['torch'].tensor, accessed_by=GetAttrGuardAccessor(tensor) | |
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].tensor, 139845228703840) | |
| | | +- GuardManager: source=G['torch'].minimum, accessed_by=GetAttrGuardAccessor(minimum) | |
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].minimum, 139845228824272) | |
| | | +- GuardManager: source=G['torch'].transpose, accessed_by=GetAttrGuardAccessor(transpose) | |
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].transpose, 139845228736688) | |
| | +- GuardManager: source=G['__builtins_dict___46'], accessed_by=DictGetItemGuardAccessor(__builtins_dict___46) | |
| | | +- GuardManager: source=G['__builtins_dict___46']['len'], accessed_by=DictGetItemGuardAccessor(len) | |
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___46']['len'], 139845257826832) | |
| | | +- GuardManager: source=G['__builtins_dict___46']['zip'], accessed_by=DictGetItemGuardAccessor(zip) | |
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___46']['zip'], 7491872) | |
| | | +- GuardManager: source=G['__builtins_dict___46']['range'], accessed_by=DictGetItemGuardAccessor(range) | |
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___46']['range'], 7632448) | |
| | +- GuardManager: source=G['__import_torch_dot__dynamo_dot_polyfill'], accessed_by=DictGetItemGuardAccessor(__import_torch_dot__dynamo_dot_polyfill) | |
| | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot__dynamo_dot_polyfill'], 139839728158336) | |
| +- GuardManager: source=L['to_blocked_mask'], accessed_by=DictGetItemGuardAccessor(to_blocked_mask) | |
| | +- TENSOR_ALIASING: L['from_blocked_mask'] is L['to_blocked_mask'] | |
V0627 17:31:09.568000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "15/0", "frame_key": "20", "co_name": "torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 583, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 58, "shape_env_guard_count": 0, "graph_op_count": 208, "graph_node_count": 228, "graph_input_count": 19, "start_time": 1719534664.260442, "entire_frame_compile_time_s": 5.308261871337891, "backend_compile_time_s": 5.101780414581299, "inductor_compile_time_s": 4.007972240447998, "code_gen_time_s": 3.5389716625213623, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": [], "dynamo_time_before_restart_s": 0.0, "has_guarded_code": true}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:09.580000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 472, "name": "forward", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 16, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:09.582000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 40, "size": 2555904}, "frame_id": 16, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:09.582000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 12, 64], "is_leaf": true, "stride": [638976, 64, 53248, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311dbd80>", "describer_id": 40}, "frame_id": 16, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:09.582000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 40, "id": 0, "source": "L['___stack0'][0]"}, "frame_id": 16, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:09.587000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {"l_stack0_0_": [1, 832, 12, 64], "contiguous": [1, 832, 12, 64], "context_layer": [1, 832, 768]}}, "frame_id": 16, "frame_compile_id": 0, "attempt": 0, "has_payload": "153b3dc8bb7ea7326b02a24531cf2b23"} | |
class GraphModule(torch.nn.Module): | |
def forward(self, L_stack0_0_: "f32[1, 832, 12, 64][638976, 64, 53248, 1]cpu"): | |
l_stack0_0_ = L_stack0_0_ | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:495 in torch_dynamo_resume_in_forward_at_472, code: context_layer = context_layer.contiguous().view(batch_size, from_seq_length, -1) | |
contiguous: "f32[1, 832, 12, 64][638976, 768, 64, 1]cpu" = l_stack0_0_.contiguous(); l_stack0_0_ = None | |
context_layer: "f32[1, 832, 768][638976, 768, 1]cpu" = contiguous.view(1, 832, -1); contiguous = None | |
return (context_layer,) | |
V0627 17:31:09.599000 139845268738432 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:199] {"aot_forward_graph": {}, "frame_id": 16, "frame_compile_id": 0, "attempt": 0, "has_payload": "4088b7608c41845b848a0fa539961d1e"} | |
class <lambda>(torch.nn.Module): | |
def forward(self, arg0_1: "f32[1, 832, 12, 64][638976, 64, 53248, 1]cpu"): | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:495 in torch_dynamo_resume_in_forward_at_472, code: context_layer = context_layer.contiguous().view(batch_size, from_seq_length, -1) | |
clone: "f32[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.clone.default(arg0_1, memory_format = torch.contiguous_format); arg0_1 = None | |
view: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.view.default(clone, [1, 832, -1]); clone = None | |
return (view,) | |
V0627 17:31:09.609000 139845268738432 torch/_inductor/compile_fx.py:754] {"inductor_post_grad_graph": {}, "frame_id": 16, "frame_compile_id": 0, "attempt": 0, "has_payload": "33da1fe849e643eaf3458df62aaeea7e"} | |
class <lambda>(torch.nn.Module): | |
def forward(self, arg0_1: "f32[1, 832, 12, 64][638976, 64, 53248, 1]cpu"): | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:495 in torch_dynamo_resume_in_forward_at_472, code: context_layer = context_layer.contiguous().view(batch_size, from_seq_length, -1) | |
clone: "f32[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.clone.default(arg0_1, memory_format = torch.contiguous_format); arg0_1 = None | |
view: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.reshape.default(clone, [1, 832, -1]); clone = None | |
return (view,) | |
V0627 17:31:09.703000 139845268738432 torch/_inductor/graph.py:1693] {"inductor_output_code": {"filename": "/tmp/torchinductor_leslie/7l/c7lltvlss5l4w5dsp4k3kpmjg6nemqpgb5mrjqqw2csgjbuvtav3.py"}, "frame_id": 16, "frame_compile_id": 0, "attempt": 0, "has_payload": "675b3bf5875d915c125bff4b02eb31f4"} | |
# AOT ID: ['9_inference'] | |
from ctypes import c_void_p, c_long | |
import torch | |
import math | |
import random | |
import os | |
import tempfile | |
from math import inf, nan | |
from torch._inductor.hooks import run_intermediate_hooks | |
from torch._inductor.utils import maybe_profile | |
from torch._inductor.codegen.memory_planning import _align as align | |
from torch import device, empty_strided | |
from torch._inductor.async_compile import AsyncCompile | |
from torch._inductor.select_algorithm import extern_kernels | |
from torch._inductor.codegen.multi_kernel import MultiKernelCall | |
aten = torch.ops.aten | |
inductor_ops = torch.ops.inductor | |
_quantized = torch.ops._quantized | |
assert_size_stride = torch._C._dynamo.guards.assert_size_stride | |
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu | |
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda | |
alloc_from_pool = torch.ops.inductor._alloc_from_pool | |
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor | |
async_compile = AsyncCompile() | |
cpp_fused_clone_0 = async_compile.cpp_pybinding(['const float*', 'float*'], ''' | |
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h" | |
extern "C" void kernel(const float* in_ptr0, | |
float* out_ptr0) | |
{ | |
#pragma omp parallel num_threads(56) | |
{ | |
int tid = omp_get_thread_num(); | |
{ | |
#pragma omp for | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(832L); x0+=static_cast<long>(1L)) | |
{ | |
#pragma GCC ivdep | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(12L); x1+=static_cast<long>(1L)) | |
{ | |
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<long>(x2 + (64L*x0) + (53248L*x1)), 16); | |
tmp0.store(out_ptr0 + static_cast<long>(x2 + (64L*x1) + (768L*x0))); | |
} | |
} | |
} | |
} | |
} | |
} | |
''') | |
async_compile.wait(globals()) | |
del async_compile | |
def call(args): | |
arg0_1, = args | |
args.clear() | |
assert_size_stride(arg0_1, (1, 832, 12, 64), (638976, 64, 53248, 1)) | |
buf0 = empty_strided_cpu((1, 832, 12, 64), (638976, 768, 64, 1), torch.float32) | |
cpp_fused_clone_0(arg0_1, buf0) | |
del arg0_1 | |
return (reinterpret_tensor(buf0, (1, 832, 768), (638976, 768, 1), 0), ) | |
def benchmark_compiled_module(times=10, repeat=10): | |
from torch._dynamo.testing import rand_strided | |
from torch._inductor.utils import print_performance | |
arg0_1 = rand_strided((1, 832, 12, 64), (638976, 64, 53248, 1), device='cpu', dtype=torch.float32) | |
fn = lambda: call([arg0_1]) | |
return print_performance(fn, times=times, repeat=repeat) | |
if __name__ == "__main__": | |
from torch._inductor.wrapper_benchmark import compiled_module_main | |
compiled_module_main('hf_BigBird', benchmark_compiled_module) | |
V0627 17:31:09.710000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 16, "frame_compile_id": 0, "attempt": 0, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"} | |
[ | |
] | |
V0627 17:31:09.710000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 16, "frame_compile_id": 0, "attempt": 0, "has_payload": "832b6bdf2f2092cb0e2ca7f3e3a30237"} | |
TREE_GUARD_MANAGER: | |
+- RootGuardManager | |
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards | |
| +- GLOBAL_STATE: ___check_global_state() | |
| +- GuardManager: source=L['___stack0'], accessed_by=DictGetItemGuardAccessor(___stack0) | |
| | +- TYPE_MATCH: ___check_type_id(L['___stack0'], 7625984) | |
| | +- LENGTH_CHECK: len(L['___stack0']) == 2 | |
| | +- GuardManager: source=L['___stack0'][0], accessed_by=TupleGetItemGuardAccessor(0) | |
| | | +- TENSOR_MATCH: check_tensor(L['___stack0'][0], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 832, 12, 64], stride=[638976, 64, 53248, 1]) | |
| | | +- NO_HASATTR: hasattr(L['___stack0'][0], '_dynamo_dynamic_indices') == False | |
| | +- GuardManager: source=L['___stack0'][1], accessed_by=TupleGetItemGuardAccessor(1) | |
| | | +- ID_MATCH: ___check_obj_id(L['___stack0'][1], 7636800) | |
| +- GuardManager: source=L['batch_size'], accessed_by=DictGetItemGuardAccessor(batch_size) | |
| | +- EQUALS_MATCH: L['batch_size'] == 1 | |
| +- GuardManager: source=L['from_seq_length'], accessed_by=DictGetItemGuardAccessor(from_seq_length) | |
| | +- EQUALS_MATCH: L['from_seq_length'] == 832 | |
| +- GuardManager: source=L['output_attentions'], accessed_by=DictGetItemGuardAccessor(output_attentions) | |
| | +- ID_MATCH: ___check_obj_id(L['output_attentions'], 7685824) | |
V0627 17:31:09.710000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "16/0", "frame_key": "21", "co_name": "torch_dynamo_resume_in_forward_at_472", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 472, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 11, "shape_env_guard_count": 0, "graph_op_count": 2, "graph_node_count": 4, "graph_input_count": 1, "start_time": 1719534669.5804062, "entire_frame_compile_time_s": 0.13004136085510254, "backend_compile_time_s": 0.12020564079284668, "inductor_compile_time_s": 0.09919452667236328, "code_gen_time_s": 0.08350419998168945, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": [], "dynamo_time_before_restart_s": 0.0, "has_guarded_code": true}, "frame_id": 16, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:09.711000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:09.719000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 42, "size": 2555904}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:09.719000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30911a30>", "describer_id": 42}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:09.719000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 42, "id": 0, "source": "L['___stack0'][0]"}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:09.724000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 3, "describer_id": 42, "size": 2555904}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:09.724000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 4, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 3, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6e30>", "describer_id": 42}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:09.724000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 42, "id": 4, "source": "L['hidden_states']"}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:09.731000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {"l_stack0_0_": [1, 832, 768], "l_hidden_states_": [1, 832, 768], "hidden_states": [1, 832, 768], "hidden_states_1": [1, 832, 768], "add": [1, 832, 768], "hidden_states_2": [1, 832, 768]}}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0, "has_payload": "5cbaeaa3b94e9560f38738cbbbf2efd6"} | |
class GraphModule(torch.nn.Module): | |
def forward(self, L_stack0_0_: "f32[1, 832, 768][638976, 768, 1]cpu", L_hidden_states_: "f32[1, 832, 768][638976, 768, 1]cpu"): | |
l_stack0_0_ = L_stack0_0_ | |
l_hidden_states_ = L_hidden_states_ | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1316 in forward, code: hidden_states = self.dense(hidden_states) | |
hidden_states: "bf16[1, 832, 768][638976, 768, 1]cpu" = self.L__self___output_dense(l_stack0_0_); l_stack0_0_ = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1317 in forward, code: hidden_states = self.dropout(hidden_states) | |
hidden_states_1: "bf16[1, 832, 768][638976, 768, 1]cpu" = self.L__self___output_dropout(hidden_states); hidden_states = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1318 in forward, code: hidden_states = self.LayerNorm(hidden_states + input_tensor) | |
add: "f32[1, 832, 768][638976, 768, 1]cpu" = hidden_states_1 + l_hidden_states_; hidden_states_1 = l_hidden_states_ = None | |
hidden_states_2: "f32[1, 832, 768][638976, 768, 1]cpu" = self.L__self___output_LayerNorm(add); add = None | |
return (hidden_states_2,) | |
V0627 17:31:09.771000 139845268738432 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:199] {"aot_forward_graph": {}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0, "has_payload": "1cd1232b8ea80a91453ce72d7309f42c"} | |
class <lambda>(torch.nn.Module): | |
def forward(self, arg0_1: "f32[768, 768][768, 1]cpu", arg1_1: "f32[768][1]cpu", arg2_1: "f32[768][1]cpu", arg3_1: "f32[768][1]cpu", arg4_1: "f32[1, 832, 768][638976, 768, 1]cpu", arg5_1: "f32[1, 832, 768][638976, 768, 1]cpu"): | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1316 in forward, code: hidden_states = self.dense(hidden_states) | |
convert_element_type: "bf16[768][1]cpu" = torch.ops.prims.convert_element_type.default(arg1_1, torch.bfloat16); arg1_1 = None | |
convert_element_type_1: "bf16[768, 768][768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg0_1, torch.bfloat16); arg0_1 = None | |
convert_element_type_2: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg4_1, torch.bfloat16); arg4_1 = None | |
view: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.view.default(convert_element_type_2, [832, 768]); convert_element_type_2 = None | |
permute: "bf16[768, 768][1, 768]cpu" = torch.ops.aten.permute.default(convert_element_type_1, [1, 0]); convert_element_type_1 = None | |
addmm: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.addmm.default(convert_element_type, view, permute); convert_element_type = view = permute = None | |
view_1: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.view.default(addmm, [1, 832, 768]); addmm = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1317 in forward, code: hidden_states = self.dropout(hidden_states) | |
clone: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.clone.default(view_1); view_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1318 in forward, code: hidden_states = self.LayerNorm(hidden_states + input_tensor) | |
add: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(clone, arg5_1); clone = arg5_1 = None | |
var_mean = torch.ops.aten.var_mean.correction(add, [2], correction = 0, keepdim = True) | |
getitem: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[0] | |
getitem_1: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[1]; var_mean = None | |
add_1: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.add.Tensor(getitem, 1e-12); getitem = None | |
rsqrt: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.rsqrt.default(add_1); add_1 = None | |
sub: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.sub.Tensor(add, getitem_1); add = getitem_1 = None | |
mul: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(sub, rsqrt); sub = rsqrt = None | |
mul_1: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(mul, arg2_1); mul = arg2_1 = None | |
add_2: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(mul_1, arg3_1); mul_1 = arg3_1 = None | |
return (add_2,) | |
V0627 17:31:09.822000 139845268738432 torch/_inductor/compile_fx.py:754] {"inductor_post_grad_graph": {}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0, "has_payload": "e2b95d7d56d3ed2a8ad6cfb284f41613"} | |
class <lambda>(torch.nn.Module): | |
def forward(self, arg4_1: "f32[1, 832, 768][638976, 768, 1]cpu", arg5_1: "f32[1, 832, 768][638976, 768, 1]cpu"): | |
# No stacktrace found for following nodes | |
_frozen_param2: "f32[768][1]cpu" = self._frozen_param2 | |
_frozen_param3: "f32[768][1]cpu" = self._frozen_param3 | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1316 in forward, code: hidden_states = self.dense(hidden_states) | |
_frozen_param4: "bf16[768][1]cpu" = self._frozen_param4 | |
# No stacktrace found for following nodes | |
_frozen_param6: "bf16[768, 768][1, 0]cpu" = self._frozen_param6 | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1316 in forward, code: hidden_states = self.dense(hidden_states) | |
convert_element_type_2: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg4_1, torch.bfloat16); arg4_1 = None | |
_linear_pointwise_default_1: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.mkldnn._linear_pointwise.default(convert_element_type_2, _frozen_param6, _frozen_param4, 'none', [], ''); convert_element_type_2 = _frozen_param6 = _frozen_param4 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1318 in forward, code: hidden_states = self.LayerNorm(hidden_states + input_tensor) | |
add: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(_linear_pointwise_default_1, arg5_1); _linear_pointwise_default_1 = arg5_1 = None | |
var_mean = torch.ops.aten.var_mean.correction(add, [2], correction = 0, keepdim = True) | |
getitem: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[0] | |
getitem_1: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[1]; var_mean = None | |
sub: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.sub.Tensor(add, getitem_1); add = getitem_1 = None | |
add_1: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.add.Tensor(getitem, 1e-12); getitem = None | |
rsqrt: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.rsqrt.default(add_1); add_1 = None | |
mul: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(sub, rsqrt); sub = rsqrt = None | |
mul_1: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(mul, _frozen_param2); mul = _frozen_param2 = None | |
add_2: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(mul_1, _frozen_param3); mul_1 = _frozen_param3 = None | |
return (add_2,) | |
V0627 17:31:09.919000 139845268738432 torch/_inductor/graph.py:1693] {"inductor_output_code": {"filename": "/tmp/torchinductor_leslie/ot/cotc6xdws22smodcitafp7uurqklfk4ux2ijtnzkqwktzn6c3wk3.py"}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0, "has_payload": "320320d26970537cad9fa4b92420ab78"} | |
# AOT ID: ['10_inference'] | |
from ctypes import c_void_p, c_long | |
import torch | |
import math | |
import random | |
import os | |
import tempfile | |
from math import inf, nan | |
from torch._inductor.hooks import run_intermediate_hooks | |
from torch._inductor.utils import maybe_profile | |
from torch._inductor.codegen.memory_planning import _align as align | |
from torch import device, empty_strided | |
from torch._inductor.async_compile import AsyncCompile | |
from torch._inductor.select_algorithm import extern_kernels | |
from torch._inductor.codegen.multi_kernel import MultiKernelCall | |
aten = torch.ops.aten | |
inductor_ops = torch.ops.inductor | |
_quantized = torch.ops._quantized | |
assert_size_stride = torch._C._dynamo.guards.assert_size_stride | |
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu | |
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda | |
alloc_from_pool = torch.ops.inductor._alloc_from_pool | |
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor | |
async_compile = AsyncCompile() | |
_frozen_param2 = None # device(type='cpu') torch.float32 (768,) (1,) 7f2eb1d44fe0 | |
_frozen_param3 = None # device(type='cpu') torch.float32 (768,) (1,) 7f2eb1d45080 | |
_frozen_param4 = None # device(type='cpu') torch.bfloat16 (768,) (1,) 7f2e30928a90 | |
_frozen_param6 = None # device(type='cpu') torch.bfloat16 (768, 768) (1, 0) 7f2e303a2cf0 | |
cpp_fused__to_copy_0 = async_compile.cpp_pybinding(['const float*', 'bfloat16*'], ''' | |
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h" | |
extern "C" void kernel(const float* in_ptr0, | |
bfloat16* out_ptr0) | |
{ | |
#pragma omp parallel num_threads(56) | |
{ | |
int tid = omp_get_thread_num(); | |
{ | |
#pragma omp for | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(638976L); x0+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<long>(x0), 16); | |
auto tmp1 = at::vec::convert<bfloat16>(tmp0); | |
tmp1.store(out_ptr0 + static_cast<long>(x0), 16); | |
} | |
} | |
} | |
} | |
''') | |
cpp_fused_add_native_layer_norm_1 = async_compile.cpp_pybinding(['const bfloat16*', 'const float*', 'const float*', 'const float*', 'float*', 'float*', 'float*'], ''' | |
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h" | |
extern "C" void kernel(const bfloat16* in_ptr0, | |
const float* in_ptr1, | |
const float* in_ptr2, | |
const float* in_ptr3, | |
float* out_ptr0, | |
float* out_ptr1, | |
float* out_ptr2) | |
{ | |
#pragma omp parallel num_threads(56) | |
{ | |
int tid = omp_get_thread_num(); | |
{ | |
#pragma omp for | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(832L); x0+=static_cast<long>(1L)) | |
{ | |
{ | |
Welford<float> tmp_acc0 = Welford<float>(); | |
Welford<at::vec::Vectorized<float>> tmp_acc0_vec = Welford<at::vec::Vectorized<float>>(); | |
static WeightRecp<at::vec::Vectorized<float>> weight_recps(static_cast<long>(48L)); | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(768L); x1+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x1 + (768L*x0)), 16); | |
auto tmp2 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(x1 + (768L*x0)), 16); | |
auto tmp1 = at::vec::convert<float>(tmp0); | |
auto tmp3 = tmp1 + tmp2; | |
tmp_acc0_vec = welford_combine(tmp_acc0_vec, tmp3, &weight_recps); | |
} | |
tmp_acc0 = welford_combine(tmp_acc0, welford_vec_reduce_all(tmp_acc0_vec)); | |
out_ptr0[static_cast<long>(x0)] = static_cast<float>(tmp_acc0.mean); | |
out_ptr1[static_cast<long>(x0)] = static_cast<float>(tmp_acc0.m2); | |
} | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(768L); x1+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x1 + (768L*x0)), 16); | |
auto tmp2 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(x1 + (768L*x0)), 16); | |
auto tmp4 = out_ptr0[static_cast<long>(x0)]; | |
auto tmp7 = out_ptr1[static_cast<long>(x0)]; | |
auto tmp15 = at::vec::Vectorized<float>::loadu(in_ptr2 + static_cast<long>(x1), 16); | |
auto tmp17 = at::vec::Vectorized<float>::loadu(in_ptr3 + static_cast<long>(x1), 16); | |
auto tmp1 = at::vec::convert<float>(tmp0); | |
auto tmp3 = tmp1 + tmp2; | |
auto tmp5 = at::vec::Vectorized<float>(tmp4); | |
auto tmp6 = tmp3 - tmp5; | |
auto tmp8 = static_cast<float>(768.0); | |
auto tmp9 = tmp7 / tmp8; | |
auto tmp10 = static_cast<float>(1e-12); | |
auto tmp11 = decltype(tmp9)(tmp9 + tmp10); | |
auto tmp12 = 1 / std::sqrt(tmp11); | |
auto tmp13 = at::vec::Vectorized<float>(tmp12); | |
auto tmp14 = tmp6 * tmp13; | |
auto tmp16 = tmp14 * tmp15; | |
auto tmp18 = tmp16 + tmp17; | |
tmp18.store(out_ptr2 + static_cast<long>(x1 + (768L*x0))); | |
} | |
} | |
} | |
} | |
} | |
''') | |
async_compile.wait(globals()) | |
del async_compile | |
def call(args): | |
arg4_1, arg5_1 = args | |
args.clear() | |
assert_size_stride(arg4_1, (1, 832, 768), (638976, 768, 1)) | |
assert_size_stride(arg5_1, (1, 832, 768), (638976, 768, 1)) | |
buf0 = empty_strided_cpu((1, 832, 768), (638976, 768, 1), torch.bfloat16) | |
cpp_fused__to_copy_0(arg4_1, buf0) | |
del arg4_1 | |
buf1 = torch.ops.mkldnn._linear_pointwise(buf0, _frozen_param6, _frozen_param4, 'none', [-1], '') | |
del buf0 | |
buf2 = empty_strided_cpu((1, 832, 1), (832, 1, 832), torch.float32) | |
buf3 = empty_strided_cpu((1, 832, 1), (832, 1, 832), torch.float32) | |
buf5 = empty_strided_cpu((1, 832, 768), (638976, 768, 1), torch.float32) | |
cpp_fused_add_native_layer_norm_1(buf1, arg5_1, _frozen_param2, _frozen_param3, buf2, buf3, buf5) | |
del arg5_1 | |
return (buf5, ) | |
def benchmark_compiled_module(times=10, repeat=10): | |
from torch._dynamo.testing import rand_strided | |
from torch._inductor.utils import print_performance | |
global _frozen_param2 | |
_frozen_param2 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.float32) | |
global _frozen_param3 | |
_frozen_param3 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.float32) | |
global _frozen_param4 | |
_frozen_param4 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.bfloat16) | |
global _frozen_param6 | |
_frozen_param6 = rand_strided((768, 768), (1, 0), device='cpu', dtype=torch.bfloat16) | |
arg4_1 = rand_strided((1, 832, 768), (638976, 768, 1), device='cpu', dtype=torch.float32) | |
arg5_1 = rand_strided((1, 832, 768), (638976, 768, 1), device='cpu', dtype=torch.float32) | |
fn = lambda: call([arg4_1, arg5_1]) | |
return print_performance(fn, times=times, repeat=repeat) | |
if __name__ == "__main__": | |
from torch._inductor.wrapper_benchmark import compiled_module_main | |
compiled_module_main('hf_BigBird', benchmark_compiled_module) | |
V0627 17:31:09.931000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"} | |
[ | |
] | |
V0627 17:31:09.932000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0, "has_payload": "cc9600447bc28ad3ba928d7719c0654d"} | |
TREE_GUARD_MANAGER: | |
+- RootGuardManager | |
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards | |
| +- GLOBAL_STATE: ___check_global_state() | |
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self) | |
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202275632) | |
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824) | |
| | | +- GuardManager: source=L['self']._modules, accessed_by=DictGetItemGuardAccessor(_modules) | |
| | | | +- GuardManager: source=L['self'].output, accessed_by=DictGetItemGuardAccessor(output) | |
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].output, 139839202272272) | |
| | | | | +- GuardManager: source=L['self'].output.__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | | | | +- DICT_CONTAINS: not ___dict_contains('forward', L['self'].output.__dict__) | |
| | | | | | +- GuardManager: source=L['self'].output.training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.training, 7685824) | |
| | | | | | +- GuardManager: source=L['self'].output._modules, accessed_by=DictGetItemGuardAccessor(_modules) | |
| | | | | | | +- GuardManager: source=L['self'].output.dense, accessed_by=DictGetItemGuardAccessor(dense) | |
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.dense, 139839202271456) | |
| | | | | | | | +- GuardManager: source=L['self'].output.dense.__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | | | | | | | +- GuardManager: source=L['self'].output.dense.training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.dense.training, 7685824) | |
| | | | | | | +- GuardManager: source=L['self'].output.dropout, accessed_by=DictGetItemGuardAccessor(dropout) | |
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.dropout, 139839202271168) | |
| | | | | | | | +- GuardManager: source=L['self'].output.dropout.__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | | | | | | | +- GuardManager: source=L['self'].output.dropout.training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.dropout.training, 7685824) | |
| | | | | | | +- GuardManager: source=L['self'].output.LayerNorm, accessed_by=DictGetItemGuardAccessor(LayerNorm) | |
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.LayerNorm, 139839202271504) | |
| | | | | | | | +- GuardManager: source=L['self'].output.LayerNorm.__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | | | | | | | +- GuardManager: source=L['self'].output.LayerNorm.training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.LayerNorm.training, 7685824) | |
| | | | | | +- GuardManager: source=L['self'].output._forward_hooks, accessed_by=DictGetItemGuardAccessor(_forward_hooks) | |
| | | | | | +- GuardManager: source=L['self'].output._backward_hooks, accessed_by=DictGetItemGuardAccessor(_backward_hooks) | |
| | | | | | +- GuardManager: source=L['self'].output._forward_pre_hooks, accessed_by=DictGetItemGuardAccessor(_forward_pre_hooks) | |
| | | | | | +- GuardManager: source=L['self'].output._backward_pre_hooks, accessed_by=DictGetItemGuardAccessor(_backward_pre_hooks) | |
| +- GuardManager: source=L['___stack0'], accessed_by=DictGetItemGuardAccessor(___stack0) | |
| | +- TYPE_MATCH: ___check_type_id(L['___stack0'], 7625984) | |
| | +- LENGTH_CHECK: len(L['___stack0']) == 1 | |
| | +- GuardManager: source=L['___stack0'][0], accessed_by=TupleGetItemGuardAccessor(0) | |
| | | +- TENSOR_MATCH: check_tensor(L['___stack0'][0], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 832, 768], stride=[638976, 768, 1]) | |
| | | +- NO_HASATTR: hasattr(L['___stack0'][0], '_dynamo_dynamic_indices') == False | |
| | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['___stack0'][0], L['hidden_states']) | |
| +- GuardManager: source=L['hidden_states'], accessed_by=DictGetItemGuardAccessor(hidden_states) | |
| | +- TENSOR_MATCH: check_tensor(L['hidden_states'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 832, 768], stride=[638976, 768, 1]) | |
| | +- NO_HASATTR: hasattr(L['hidden_states'], '_dynamo_dynamic_indices') == False | |
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['___stack0'][0], L['hidden_states']) | |
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor | |
| | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'], accessed_by=DictGetItemGuardAccessor(__import_torch_dot_nn_dot_modules_dot_module) | |
| | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'], 139842442598640) | |
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'].torch, accessed_by=GetAttrGuardAccessor(torch) | |
| | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'].torch, 139845236322800) | |
| | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C, accessed_by=GetAttrGuardAccessor(_C) | |
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C, 139845228547104) | |
| | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C._get_tracing_state, accessed_by=GetAttrGuardAccessor(_get_tracing_state) | |
| | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C._get_tracing_state, 139842451895088) | |
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_hooks, accessed_by=GetAttrGuardAccessor(_global_forward_hooks) | |
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_hooks, 7497696) | |
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_hooks | |
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_hooks, accessed_by=GetAttrGuardAccessor(_global_backward_hooks) | |
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_hooks, 7497696) | |
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_hooks | |
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_pre_hooks, accessed_by=GetAttrGuardAccessor(_global_forward_pre_hooks) | |
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_pre_hooks, 7497696) | |
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_pre_hooks | |
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_pre_hooks, accessed_by=GetAttrGuardAccessor(_global_backward_pre_hooks) | |
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_pre_hooks, 7497696) | |
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_pre_hooks | |
V0627 17:31:09.932000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "17/0", "frame_key": "22", "co_name": "torch_dynamo_resume_in_forward_at_1401", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 1401, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 34, "shape_env_guard_count": 0, "graph_op_count": 4, "graph_node_count": 7, "graph_input_count": 2, "start_time": 1719534669.711534, "entire_frame_compile_time_s": 0.22069621086120605, "backend_compile_time_s": 0.1933588981628418, "inductor_compile_time_s": 0.11173701286315918, "code_gen_time_s": 0.08121824264526367, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": [], "dynamo_time_before_restart_s": 0.0, "has_guarded_code": true}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:09.933000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 18, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:09.935000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 44, "size": 2555904}, "frame_id": 18, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:09.935000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e309a5760>", "describer_id": 44}, "frame_id": 18, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:09.935000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 44, "id": 0, "source": "L['___stack0'][0]"}, "frame_id": 18, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:09.984000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {"l_stack0_0_": [1, 832, 768], "hidden_states": [1, 832, 3072], "mul": [1, 832, 3072], "pow_1": [1, 832, 3072], "mul_1": [1, 832, 3072], "add": [1, 832, 3072], "mul_2": [1, 832, 3072], "tanh": [1, 832, 3072], "add_1": [1, 832, 3072], "hidden_states_1": [1, 832, 3072], "hidden_states_2": [1, 832, 768], "hidden_states_3": [1, 832, 768], "add_2": [1, 832, 768], "hidden_states_4": [1, 832, 768]}}, "frame_id": 18, "frame_compile_id": 0, "attempt": 0, "has_payload": "28432eb8c22b77d39d8eae55f0796aec"} | |
class GraphModule(torch.nn.Module): | |
def forward(self, L_stack0_0_: "f32[1, 832, 768][638976, 768, 1]cpu"): | |
l_stack0_0_ = L_stack0_0_ | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1421 in forward, code: hidden_states = self.dense(hidden_states) | |
hidden_states: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = self.L__self___intermediate_dense(l_stack0_0_) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/activations.py:57 in forward, code: return 0.5 * input * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0)))) | |
mul: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = 0.5 * hidden_states | |
pow_1: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.pow(hidden_states, 3.0) | |
mul_1: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = 0.044715 * pow_1; pow_1 = None | |
add: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = hidden_states + mul_1; hidden_states = mul_1 = None | |
mul_2: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = 0.7978845608028654 * add; add = None | |
tanh: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.tanh(mul_2); mul_2 = None | |
add_1: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = 1.0 + tanh; tanh = None | |
hidden_states_1: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = mul * add_1; mul = add_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1435 in forward, code: hidden_states = self.dense(hidden_states) | |
hidden_states_2: "bf16[1, 832, 768][638976, 768, 1]cpu" = self.L__self___output_dense(hidden_states_1); hidden_states_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1436 in forward, code: hidden_states = self.dropout(hidden_states) | |
hidden_states_3: "bf16[1, 832, 768][638976, 768, 1]cpu" = self.L__self___output_dropout(hidden_states_2); hidden_states_2 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1437 in forward, code: hidden_states = self.LayerNorm(hidden_states + input_tensor) | |
add_2: "f32[1, 832, 768][638976, 768, 1]cpu" = hidden_states_3 + l_stack0_0_; hidden_states_3 = l_stack0_0_ = None | |
hidden_states_4: "f32[1, 832, 768][638976, 768, 1]cpu" = self.L__self___output_LayerNorm(add_2); add_2 = None | |
return (hidden_states_4,) | |
V0627 17:31:10.051000 139845268738432 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:199] {"aot_forward_graph": {}, "frame_id": 18, "frame_compile_id": 0, "attempt": 0, "has_payload": "c94939d327a02b378b1745a04171ca4e"} | |
class <lambda>(torch.nn.Module): | |
def forward(self, arg0_1: "f32[3072, 768][768, 1]cpu", arg1_1: "f32[3072][1]cpu", arg2_1: "f32[768, 3072][3072, 1]cpu", arg3_1: "f32[768][1]cpu", arg4_1: "f32[768][1]cpu", arg5_1: "f32[768][1]cpu", arg6_1: "f32[1, 832, 768][638976, 768, 1]cpu"): | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1421 in forward, code: hidden_states = self.dense(hidden_states) | |
convert_element_type: "bf16[3072][1]cpu" = torch.ops.prims.convert_element_type.default(arg1_1, torch.bfloat16); arg1_1 = None | |
convert_element_type_1: "bf16[3072, 768][768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg0_1, torch.bfloat16); arg0_1 = None | |
convert_element_type_2: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg6_1, torch.bfloat16) | |
view: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.view.default(convert_element_type_2, [832, 768]); convert_element_type_2 = None | |
permute: "bf16[768, 3072][1, 768]cpu" = torch.ops.aten.permute.default(convert_element_type_1, [1, 0]); convert_element_type_1 = None | |
addmm: "bf16[832, 3072][3072, 1]cpu" = torch.ops.aten.addmm.default(convert_element_type, view, permute); convert_element_type = view = permute = None | |
view_1: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.view.default(addmm, [1, 832, 3072]); addmm = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/activations.py:57 in forward, code: return 0.5 * input * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0)))) | |
mul: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.mul.Tensor(view_1, 0.5) | |
pow_1: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.pow.Tensor_Scalar(view_1, 3.0) | |
mul_1: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.mul.Tensor(pow_1, 0.044715); pow_1 = None | |
add: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.add.Tensor(view_1, mul_1); view_1 = mul_1 = None | |
mul_2: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.mul.Tensor(add, 0.7978845608028654); add = None | |
tanh: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.tanh.default(mul_2); mul_2 = None | |
add_1: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.add.Tensor(tanh, 1.0); tanh = None | |
mul_3: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.mul.Tensor(mul, add_1); mul = add_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1435 in forward, code: hidden_states = self.dense(hidden_states) | |
convert_element_type_6: "bf16[768][1]cpu" = torch.ops.prims.convert_element_type.default(arg3_1, torch.bfloat16); arg3_1 = None | |
convert_element_type_7: "bf16[768, 3072][3072, 1]cpu" = torch.ops.prims.convert_element_type.default(arg2_1, torch.bfloat16); arg2_1 = None | |
view_2: "bf16[832, 3072][3072, 1]cpu" = torch.ops.aten.view.default(mul_3, [832, 3072]); mul_3 = None | |
permute_1: "bf16[3072, 768][1, 3072]cpu" = torch.ops.aten.permute.default(convert_element_type_7, [1, 0]); convert_element_type_7 = None | |
addmm_1: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.addmm.default(convert_element_type_6, view_2, permute_1); convert_element_type_6 = view_2 = permute_1 = None | |
view_3: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.view.default(addmm_1, [1, 832, 768]); addmm_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1436 in forward, code: hidden_states = self.dropout(hidden_states) | |
clone: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.clone.default(view_3); view_3 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1437 in forward, code: hidden_states = self.LayerNorm(hidden_states + input_tensor) | |
add_2: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(clone, arg6_1); clone = arg6_1 = None | |
var_mean = torch.ops.aten.var_mean.correction(add_2, [2], correction = 0, keepdim = True) | |
getitem: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[0] | |
getitem_1: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[1]; var_mean = None | |
add_3: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.add.Tensor(getitem, 1e-12); getitem = None | |
rsqrt: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.rsqrt.default(add_3); add_3 = None | |
sub: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.sub.Tensor(add_2, getitem_1); add_2 = getitem_1 = None | |
mul_4: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(sub, rsqrt); sub = rsqrt = None | |
mul_5: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(mul_4, arg4_1); mul_4 = arg4_1 = None | |
add_4: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(mul_5, arg5_1); mul_5 = arg5_1 = None | |
return (add_4,) | |
V0627 17:31:10.133000 139845268738432 torch/_inductor/compile_fx.py:754] {"inductor_post_grad_graph": {}, "frame_id": 18, "frame_compile_id": 0, "attempt": 0, "has_payload": "8400618ae53b7968980ef85788f68b83"} | |
class <lambda>(torch.nn.Module): | |
def forward(self, arg6_1: "f32[1, 832, 768][638976, 768, 1]cpu"): | |
# No stacktrace found for following nodes | |
_frozen_param4: "f32[768][1]cpu" = self._frozen_param4 | |
_frozen_param5: "f32[768][1]cpu" = self._frozen_param5 | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1421 in forward, code: hidden_states = self.dense(hidden_states) | |
_frozen_param6: "bf16[3072][1]cpu" = self._frozen_param6 | |
# No stacktrace found for following nodes | |
_frozen_param10: "bf16[3072, 768][1, 0]cpu" = self._frozen_param10 | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1435 in forward, code: hidden_states = self.dense(hidden_states) | |
_frozen_param8: "bf16[768][1]cpu" = self._frozen_param8 | |
# No stacktrace found for following nodes | |
_frozen_param11: "bf16[768, 3072][1, 0]cpu" = self._frozen_param11 | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1421 in forward, code: hidden_states = self.dense(hidden_states) | |
convert_element_type_2: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg6_1, torch.bfloat16) | |
_linear_pointwise_default_3: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.mkldnn._linear_pointwise.default(convert_element_type_2, _frozen_param10, _frozen_param6, 'none', [], ''); convert_element_type_2 = _frozen_param10 = _frozen_param6 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/activations.py:57 in forward, code: return 0.5 * input * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0)))) | |
mul: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.mul.Tensor(_linear_pointwise_default_3, 0.5) | |
pow_1: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.pow.Tensor_Scalar(_linear_pointwise_default_3, 3.0) | |
mul_1: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.mul.Tensor(pow_1, 0.044715); pow_1 = None | |
add: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.add.Tensor(_linear_pointwise_default_3, mul_1); _linear_pointwise_default_3 = mul_1 = None | |
mul_2: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.mul.Tensor(add, 0.7978845608028654); add = None | |
tanh: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.tanh.default(mul_2); mul_2 = None | |
add_1: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.add.Tensor(tanh, 1.0); tanh = None | |
mul_3: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.mul.Tensor(mul, add_1); mul = add_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1435 in forward, code: hidden_states = self.dense(hidden_states) | |
_linear_pointwise_default_2: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.mkldnn._linear_pointwise.default(mul_3, _frozen_param11, _frozen_param8, 'none', [], ''); mul_3 = _frozen_param11 = _frozen_param8 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1437 in forward, code: hidden_states = self.LayerNorm(hidden_states + input_tensor) | |
add_2: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(_linear_pointwise_default_2, arg6_1); _linear_pointwise_default_2 = arg6_1 = None | |
var_mean = torch.ops.aten.var_mean.correction(add_2, [2], correction = 0, keepdim = True) | |
getitem: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[0] | |
getitem_1: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[1]; var_mean = None | |
sub: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.sub.Tensor(add_2, getitem_1); add_2 = getitem_1 = None | |
add_3: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.add.Tensor(getitem, 1e-12); getitem = None | |
rsqrt: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.rsqrt.default(add_3); add_3 = None | |
mul_4: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(sub, rsqrt); sub = rsqrt = None | |
mul_5: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(mul_4, _frozen_param4); mul_4 = _frozen_param4 = None | |
add_4: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(mul_5, _frozen_param5); mul_5 = _frozen_param5 = None | |
return (add_4,) | |
V0627 17:31:10.240000 139845268738432 torch/_inductor/graph.py:1693] {"inductor_output_code": {"filename": "/tmp/torchinductor_leslie/yq/cyqi5vcdu2onzw25fkzgawphp3sm6xov6rt4wwjoshykrnlnqms3.py"}, "frame_id": 18, "frame_compile_id": 0, "attempt": 0, "has_payload": "220b8ade00d54ed30a9ebc3492a6ee4d"} | |
# AOT ID: ['11_inference'] | |
from ctypes import c_void_p, c_long | |
import torch | |
import math | |
import random | |
import os | |
import tempfile | |
from math import inf, nan | |
from torch._inductor.hooks import run_intermediate_hooks | |
from torch._inductor.utils import maybe_profile | |
from torch._inductor.codegen.memory_planning import _align as align | |
from torch import device, empty_strided | |
from torch._inductor.async_compile import AsyncCompile | |
from torch._inductor.select_algorithm import extern_kernels | |
from torch._inductor.codegen.multi_kernel import MultiKernelCall | |
aten = torch.ops.aten | |
inductor_ops = torch.ops.inductor | |
_quantized = torch.ops._quantized | |
assert_size_stride = torch._C._dynamo.guards.assert_size_stride | |
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu | |
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda | |
alloc_from_pool = torch.ops.inductor._alloc_from_pool | |
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor | |
async_compile = AsyncCompile() | |
_frozen_param4 = None # device(type='cpu') torch.float32 (768,) (1,) 7f2eb1d45300 | |
_frozen_param5 = None # device(type='cpu') torch.float32 (768,) (1,) 7f2eb1d45350 | |
_frozen_param6 = None # device(type='cpu') torch.bfloat16 (3072,) (1,) 7f2e301a7600 | |
_frozen_param10 = None # device(type='cpu') torch.bfloat16 (3072, 768) (1, 0) 7f2e3013c8b0 | |
_frozen_param8 = None # device(type='cpu') torch.bfloat16 (768,) (1,) 7f2e3013f830 | |
_frozen_param11 = None # device(type='cpu') torch.bfloat16 (768, 3072) (1, 0) 7f2e3013c2c0 | |
cpp_fused__to_copy_0 = async_compile.cpp_pybinding(['const float*', 'bfloat16*'], ''' | |
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h" | |
extern "C" void kernel(const float* in_ptr0, | |
bfloat16* out_ptr0) | |
{ | |
#pragma omp parallel num_threads(56) | |
{ | |
int tid = omp_get_thread_num(); | |
{ | |
#pragma omp for | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(638976L); x0+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<long>(x0), 16); | |
auto tmp1 = at::vec::convert<bfloat16>(tmp0); | |
tmp1.store(out_ptr0 + static_cast<long>(x0), 16); | |
} | |
} | |
} | |
} | |
''') | |
cpp_fused_add_mul_pow_tanh_1 = async_compile.cpp_pybinding(['bfloat16*'], ''' | |
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h" | |
extern "C" void kernel(bfloat16* in_out_ptr0) | |
{ | |
#pragma omp parallel num_threads(56) | |
{ | |
int tid = omp_get_thread_num(); | |
{ | |
#pragma omp for | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(2555904L); x0+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_out_ptr0 + static_cast<long>(x0), 16); | |
auto tmp1 = at::vec::convert<float>(tmp0); | |
auto tmp2 = static_cast<float>(0.5); | |
auto tmp3 = at::vec::Vectorized<float>(tmp2); | |
auto tmp4 = tmp1 * tmp3; | |
auto tmp5 = tmp1 * tmp1; | |
auto tmp6 = tmp5 * tmp1; | |
auto tmp7 = static_cast<float>(0.044715); | |
auto tmp8 = at::vec::Vectorized<float>(tmp7); | |
auto tmp9 = tmp6 * tmp8; | |
auto tmp10 = tmp1 + tmp9; | |
auto tmp11 = static_cast<float>(0.7978845608028654); | |
auto tmp12 = at::vec::Vectorized<float>(tmp11); | |
auto tmp13 = tmp10 * tmp12; | |
auto tmp14 = decltype(tmp13)(2) / (decltype(tmp13)(1) + (decltype(tmp13)(-2) * tmp13).exp()) - decltype(tmp13)(1); | |
auto tmp15 = static_cast<float>(1.0); | |
auto tmp16 = at::vec::Vectorized<float>(tmp15); | |
auto tmp17 = tmp14 + tmp16; | |
auto tmp18 = tmp4 * tmp17; | |
auto tmp19 = at::vec::convert<bfloat16>(tmp18); | |
tmp19.store(in_out_ptr0 + static_cast<long>(x0), 16); | |
} | |
} | |
} | |
} | |
''') | |
cpp_fused_add_native_layer_norm_2 = async_compile.cpp_pybinding(['const bfloat16*', 'const float*', 'const float*', 'const float*', 'float*', 'float*', 'float*'], ''' | |
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h" | |
extern "C" void kernel(const bfloat16* in_ptr0, | |
const float* in_ptr1, | |
const float* in_ptr2, | |
const float* in_ptr3, | |
float* out_ptr0, | |
float* out_ptr1, | |
float* out_ptr2) | |
{ | |
#pragma omp parallel num_threads(56) | |
{ | |
int tid = omp_get_thread_num(); | |
{ | |
#pragma omp for | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(832L); x0+=static_cast<long>(1L)) | |
{ | |
{ | |
Welford<float> tmp_acc0 = Welford<float>(); | |
Welford<at::vec::Vectorized<float>> tmp_acc0_vec = Welford<at::vec::Vectorized<float>>(); | |
static WeightRecp<at::vec::Vectorized<float>> weight_recps(static_cast<long>(48L)); | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(768L); x1+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x1 + (768L*x0)), 16); | |
auto tmp2 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(x1 + (768L*x0)), 16); | |
auto tmp1 = at::vec::convert<float>(tmp0); | |
auto tmp3 = tmp1 + tmp2; | |
tmp_acc0_vec = welford_combine(tmp_acc0_vec, tmp3, &weight_recps); | |
} | |
tmp_acc0 = welford_combine(tmp_acc0, welford_vec_reduce_all(tmp_acc0_vec)); | |
out_ptr0[static_cast<long>(x0)] = static_cast<float>(tmp_acc0.mean); | |
out_ptr1[static_cast<long>(x0)] = static_cast<float>(tmp_acc0.m2); | |
} | |
for(long x1=static_cast<long>(0L); x1<static_cast<long>(768L); x1+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x1 + (768L*x0)), 16); | |
auto tmp2 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(x1 + (768L*x0)), 16); | |
auto tmp4 = out_ptr0[static_cast<long>(x0)]; | |
auto tmp7 = out_ptr1[static_cast<long>(x0)]; | |
auto tmp15 = at::vec::Vectorized<float>::loadu(in_ptr2 + static_cast<long>(x1), 16); | |
auto tmp17 = at::vec::Vectorized<float>::loadu(in_ptr3 + static_cast<long>(x1), 16); | |
auto tmp1 = at::vec::convert<float>(tmp0); | |
auto tmp3 = tmp1 + tmp2; | |
auto tmp5 = at::vec::Vectorized<float>(tmp4); | |
auto tmp6 = tmp3 - tmp5; | |
auto tmp8 = static_cast<float>(768.0); | |
auto tmp9 = tmp7 / tmp8; | |
auto tmp10 = static_cast<float>(1e-12); | |
auto tmp11 = decltype(tmp9)(tmp9 + tmp10); | |
auto tmp12 = 1 / std::sqrt(tmp11); | |
auto tmp13 = at::vec::Vectorized<float>(tmp12); | |
auto tmp14 = tmp6 * tmp13; | |
auto tmp16 = tmp14 * tmp15; | |
auto tmp18 = tmp16 + tmp17; | |
tmp18.store(out_ptr2 + static_cast<long>(x1 + (768L*x0))); | |
} | |
} | |
} | |
} | |
} | |
''') | |
async_compile.wait(globals()) | |
del async_compile | |
def call(args): | |
arg6_1, = args | |
args.clear() | |
assert_size_stride(arg6_1, (1, 832, 768), (638976, 768, 1)) | |
buf0 = empty_strided_cpu((1, 832, 768), (638976, 768, 1), torch.bfloat16) | |
cpp_fused__to_copy_0(arg6_1, buf0) | |
buf1 = torch.ops.mkldnn._linear_pointwise(buf0, _frozen_param10, _frozen_param6, 'none', [-1], '') | |
del buf0 | |
buf2 = buf1; del buf1 # reuse | |
cpp_fused_add_mul_pow_tanh_1(buf2) | |
buf3 = torch.ops.mkldnn._linear_pointwise(buf2, _frozen_param11, _frozen_param8, 'none', [-1], '') | |
del buf2 | |
buf4 = empty_strided_cpu((1, 832, 1), (832, 1, 832), torch.float32) | |
buf5 = empty_strided_cpu((1, 832, 1), (832, 1, 832), torch.float32) | |
buf7 = empty_strided_cpu((1, 832, 768), (638976, 768, 1), torch.float32) | |
cpp_fused_add_native_layer_norm_2(buf3, arg6_1, _frozen_param4, _frozen_param5, buf4, buf5, buf7) | |
del arg6_1 | |
return (buf7, ) | |
def benchmark_compiled_module(times=10, repeat=10): | |
from torch._dynamo.testing import rand_strided | |
from torch._inductor.utils import print_performance | |
global _frozen_param4 | |
_frozen_param4 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.float32) | |
global _frozen_param5 | |
_frozen_param5 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.float32) | |
global _frozen_param6 | |
_frozen_param6 = rand_strided((3072, ), (1, ), device='cpu', dtype=torch.bfloat16) | |
global _frozen_param10 | |
_frozen_param10 = rand_strided((3072, 768), (1, 0), device='cpu', dtype=torch.bfloat16) | |
global _frozen_param8 | |
_frozen_param8 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.bfloat16) | |
global _frozen_param11 | |
_frozen_param11 = rand_strided((768, 3072), (1, 0), device='cpu', dtype=torch.bfloat16) | |
arg6_1 = rand_strided((1, 832, 768), (638976, 768, 1), device='cpu', dtype=torch.float32) | |
fn = lambda: call([arg6_1]) | |
return print_performance(fn, times=times, repeat=repeat) | |
if __name__ == "__main__": | |
from torch._inductor.wrapper_benchmark import compiled_module_main | |
compiled_module_main('hf_BigBird', benchmark_compiled_module) | |
V0627 17:31:10.257000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 18, "frame_compile_id": 0, "attempt": 0, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"} | |
[ | |
] | |
V0627 17:31:10.258000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 18, "frame_compile_id": 0, "attempt": 0, "has_payload": "f3efa14ea8c088430fc033af17fce04d"} | |
TREE_GUARD_MANAGER: | |
+- RootGuardManager | |
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards | |
| +- GLOBAL_STATE: ___check_global_state() | |
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self) | |
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202276400) | |
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824) | |
| | | +- GuardManager: source=L['self']._modules, accessed_by=DictGetItemGuardAccessor(_modules) | |
| | | | +- GuardManager: source=L['self'].output, accessed_by=DictGetItemGuardAccessor(output) | |
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].output, 139839202272320) | |
| | | | | +- GuardManager: source=L['self'].output.__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | | | | +- DICT_CONTAINS: not ___dict_contains('forward', L['self'].output.__dict__) | |
| | | | | | +- GuardManager: source=L['self'].output.training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.training, 7685824) | |
| | | | | | +- GuardManager: source=L['self'].output._modules, accessed_by=DictGetItemGuardAccessor(_modules) | |
| | | | | | | +- GuardManager: source=L['self'].output.dense, accessed_by=DictGetItemGuardAccessor(dense) | |
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.dense, 139839202267808) | |
| | | | | | | | +- GuardManager: source=L['self'].output.dense.__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | | | | | | | +- GuardManager: source=L['self'].output.dense.training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.dense.training, 7685824) | |
| | | | | | | +- GuardManager: source=L['self'].output.dropout, accessed_by=DictGetItemGuardAccessor(dropout) | |
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.dropout, 139839202268288) | |
| | | | | | | | +- GuardManager: source=L['self'].output.dropout.__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | | | | | | | +- GuardManager: source=L['self'].output.dropout.training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.dropout.training, 7685824) | |
| | | | | | | +- GuardManager: source=L['self'].output.LayerNorm, accessed_by=DictGetItemGuardAccessor(LayerNorm) | |
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.LayerNorm, 139839202268912) | |
| | | | | | | | +- GuardManager: source=L['self'].output.LayerNorm.__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | | | | | | | +- GuardManager: source=L['self'].output.LayerNorm.training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.LayerNorm.training, 7685824) | |
| | | | | | +- GuardManager: source=L['self'].output._forward_hooks, accessed_by=DictGetItemGuardAccessor(_forward_hooks) | |
| | | | | | +- GuardManager: source=L['self'].output._backward_hooks, accessed_by=DictGetItemGuardAccessor(_backward_hooks) | |
| | | | | | +- GuardManager: source=L['self'].output._forward_pre_hooks, accessed_by=DictGetItemGuardAccessor(_forward_pre_hooks) | |
| | | | | | +- GuardManager: source=L['self'].output._backward_pre_hooks, accessed_by=DictGetItemGuardAccessor(_backward_pre_hooks) | |
| | | | +- GuardManager: source=L['self'].intermediate, accessed_by=DictGetItemGuardAccessor(intermediate) | |
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].intermediate, 139839202275440) | |
| | | | | +- GuardManager: source=L['self'].intermediate.__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | | | | +- DICT_CONTAINS: not ___dict_contains('forward', L['self'].intermediate.__dict__) | |
| | | | | | +- GuardManager: source=L['self'].intermediate.training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].intermediate.training, 7685824) | |
| | | | | | +- GuardManager: source=L['self'].intermediate._modules, accessed_by=DictGetItemGuardAccessor(_modules) | |
| | | | | | | +- GuardManager: source=L['self'].intermediate.dense, accessed_by=DictGetItemGuardAccessor(dense) | |
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].intermediate.dense, 139839202270544) | |
| | | | | | | | +- GuardManager: source=L['self'].intermediate.dense.__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | | | | | | | +- GuardManager: source=L['self'].intermediate.dense.training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].intermediate.dense.training, 7685824) | |
| | | | | | | +- GuardManager: source=L['self'].intermediate.intermediate_act_fn, accessed_by=DictGetItemGuardAccessor(intermediate_act_fn) | |
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].intermediate.intermediate_act_fn, 139839202267616) | |
| | | | | | | | +- GuardManager: source=L['self'].intermediate.intermediate_act_fn.__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | | | | | | | +- DICT_CONTAINS: not ___dict_contains('forward', L['self'].intermediate.intermediate_act_fn.__dict__) | |
| | | | | | | | | +- GuardManager: source=L['self'].intermediate.intermediate_act_fn.training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].intermediate.intermediate_act_fn.training, 7685824) | |
| | | | | | | | | +- GuardManager: source=L['self'].intermediate.intermediate_act_fn._forward_hooks, accessed_by=DictGetItemGuardAccessor(_forward_hooks) | |
| | | | | | | | | +- GuardManager: source=L['self'].intermediate.intermediate_act_fn._backward_hooks, accessed_by=DictGetItemGuardAccessor(_backward_hooks) | |
| | | | | | | | | +- GuardManager: source=L['self'].intermediate.intermediate_act_fn._forward_pre_hooks, accessed_by=DictGetItemGuardAccessor(_forward_pre_hooks) | |
| | | | | | | | | +- GuardManager: source=L['self'].intermediate.intermediate_act_fn._backward_pre_hooks, accessed_by=DictGetItemGuardAccessor(_backward_pre_hooks) | |
| | | | | | +- GuardManager: source=L['self'].intermediate._forward_hooks, accessed_by=DictGetItemGuardAccessor(_forward_hooks) | |
| | | | | | +- GuardManager: source=L['self'].intermediate._backward_hooks, accessed_by=DictGetItemGuardAccessor(_backward_hooks) | |
| | | | | | +- GuardManager: source=L['self'].intermediate._forward_pre_hooks, accessed_by=DictGetItemGuardAccessor(_forward_pre_hooks) | |
| | | | | | +- GuardManager: source=L['self'].intermediate._backward_pre_hooks, accessed_by=DictGetItemGuardAccessor(_backward_pre_hooks) | |
| | | +- GuardManager: source=L['self'].is_decoder, accessed_by=DictGetItemGuardAccessor(is_decoder) | |
| | | | +- ID_MATCH: ___check_obj_id(L['self'].is_decoder, 7685824) | |
| | | +- GuardManager: source=L['self'].seq_len_dim, accessed_by=DictGetItemGuardAccessor(seq_len_dim) | |
| | | | +- EQUALS_MATCH: L['self'].seq_len_dim == 1 | |
| | | +- GuardManager: source=L['self'].chunk_size_feed_forward, accessed_by=DictGetItemGuardAccessor(chunk_size_feed_forward) | |
| | | | +- EQUALS_MATCH: L['self'].chunk_size_feed_forward == 0 | |
| +- GuardManager: source=L['___stack0'], accessed_by=DictGetItemGuardAccessor(___stack0) | |
| | +- TYPE_MATCH: ___check_type_id(L['___stack0'], 7625984) | |
| | +- LENGTH_CHECK: len(L['___stack0']) == 1 | |
| | +- GuardManager: source=L['___stack0'][0], accessed_by=TupleGetItemGuardAccessor(0) | |
| | | +- TENSOR_MATCH: check_tensor(L['___stack0'][0], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 832, 768], stride=[638976, 768, 1]) | |
| | | +- NO_HASATTR: hasattr(L['___stack0'][0], '_dynamo_dynamic_indices') == False | |
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor | |
| | +- GuardManager: source=G['apply_chunking_to_forward'], accessed_by=DictGetItemGuardAccessor(apply_chunking_to_forward) | |
| | | +- GuardManager: source=G['apply_chunking_to_forward'].__code__, accessed_by=GetAttrGuardAccessor(__code__) | |
| | | | +- ID_MATCH: ___check_obj_id(G['apply_chunking_to_forward'].__code__, 139839646455872) | |
| | +- GuardManager: source=G['__builtins_dict___52'], accessed_by=DictGetItemGuardAccessor(__builtins_dict___52) | |
| | | +- GuardManager: source=G['__builtins_dict___52']['len'], accessed_by=DictGetItemGuardAccessor(len) | |
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___52']['len'], 139845257826832) | |
| | +- GuardManager: source=G['__import_transformers_dot_activations'], accessed_by=DictGetItemGuardAccessor(__import_transformers_dot_activations) | |
| | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_activations'], 139839665031744) | |
| | | +- GuardManager: source=G['__import_transformers_dot_activations'].math, accessed_by=GetAttrGuardAccessor(math) | |
| | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_activations'].math, 139845236089744) | |
| | | | +- GuardManager: source=G['__import_transformers_dot_activations'].math.pi, accessed_by=GetAttrGuardAccessor(pi) | |
| | | | | +- EQUALS_MATCH: G['__import_transformers_dot_activations'].math.pi == 3.141592653589793 | |
| | | | +- GuardManager: source=G['__import_transformers_dot_activations'].math.sqrt, accessed_by=GetAttrGuardAccessor(sqrt) | |
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_activations'].math.sqrt, 139845236093344) | |
| | | +- GuardManager: source=G['__import_transformers_dot_activations'].torch, accessed_by=GetAttrGuardAccessor(torch) | |
| | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_activations'].torch, 139845236322800) | |
| | | | +- GuardManager: source=G['__import_transformers_dot_activations'].torch.pow, accessed_by=GetAttrGuardAccessor(pow) | |
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_activations'].torch.pow, 139845228824512) | |
| | | | +- GuardManager: source=G['__import_transformers_dot_activations'].torch.tanh, accessed_by=GetAttrGuardAccessor(tanh) | |
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_activations'].torch.tanh, 139845228799744) | |
| | +- GuardManager: source=G['__import_transformers_dot_pytorch_utils'], accessed_by=DictGetItemGuardAccessor(__import_transformers_dot_pytorch_utils) | |
| | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_pytorch_utils'], 139839703287984) | |
| | | +- GuardManager: source=G['__import_transformers_dot_pytorch_utils'].inspect, accessed_by=GetAttrGuardAccessor(inspect) | |
| | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_pytorch_utils'].inspect, 139845236517488) | |
| | | | +- GuardManager: source=G['__import_transformers_dot_pytorch_utils'].inspect.signature, accessed_by=GetAttrGuardAccessor(signature) | |
| | | | | +- GuardManager: source=G['__import_transformers_dot_pytorch_utils'].inspect.signature.__code__, accessed_by=GetAttrGuardAccessor(__code__) | |
| | | | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_pytorch_utils'].inspect.signature.__code__, 139845231798640) | |
| | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'], accessed_by=DictGetItemGuardAccessor(__import_torch_dot_nn_dot_modules_dot_module) | |
| | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'], 139842442598640) | |
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'].torch, accessed_by=GetAttrGuardAccessor(torch) | |
| | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'].torch, 139845236322800) | |
| | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C, accessed_by=GetAttrGuardAccessor(_C) | |
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C, 139845228547104) | |
| | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C._get_tracing_state, accessed_by=GetAttrGuardAccessor(_get_tracing_state) | |
| | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C._get_tracing_state, 139842451895088) | |
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_hooks, accessed_by=GetAttrGuardAccessor(_global_forward_hooks) | |
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_hooks, 7497696) | |
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_hooks | |
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_hooks, accessed_by=GetAttrGuardAccessor(_global_backward_hooks) | |
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_hooks, 7497696) | |
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_hooks | |
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_pre_hooks, accessed_by=GetAttrGuardAccessor(_global_forward_pre_hooks) | |
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_pre_hooks, 7497696) | |
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_pre_hooks | |
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_pre_hooks, accessed_by=GetAttrGuardAccessor(_global_backward_pre_hooks) | |
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_pre_hooks, 7497696) | |
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_pre_hooks | |
V0627 17:31:10.259000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "18/0", "frame_key": "23", "co_name": "torch_dynamo_resume_in_forward_at_1488", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 1488, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 69, "shape_env_guard_count": 0, "graph_op_count": 13, "graph_node_count": 15, "graph_input_count": 1, "start_time": 1719534669.9335542, "entire_frame_compile_time_s": 0.3254525661468506, "backend_compile_time_s": 0.26067519187927246, "inductor_compile_time_s": 0.1273505687713623, "code_gen_time_s": 0.07860469818115234, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": [], "dynamo_time_before_restart_s": 0.0, "has_guarded_code": true}, "frame_id": 18, "frame_compile_id": 0, "attempt": 0} | |
V0627 17:31:10.262000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.270000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 46, "size": 442368}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.270000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 46}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.270000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 46, "id": 0, "source": "L['band_mask']"}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.271000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 46, "size": 2555904}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.271000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e301592b0>", "describer_id": 46}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.271000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 46, "id": 1, "source": "L['hidden_states']"}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.273000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 46, "size": 3328}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.273000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 46}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.273000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 2, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 46}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.273000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 46, "id": 2, "source": "L['from_mask']"}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.275000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 4, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 46}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.275000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 46, "id": 4, "source": "L['to_mask']"}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.306000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 47, "size": 2555904}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.307000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e301592b0>", "describer_id": 47}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.307000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 47, "id": 0, "source": "L['hidden_states']"}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.308000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 47, "size": 442368}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.308000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 47}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.308000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 47, "id": 1, "source": "L['band_mask']"}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.309000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 47, "size": 3328}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.309000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 47}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.309000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 2, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 47}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.309000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 47, "id": 2, "source": "L['from_mask']"}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.310000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 4, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 47}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.311000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 47, "id": 4, "source": "L['to_mask']"}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.312000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 5, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 13, 64], "is_leaf": true, "is_view": true, "stride": [832, 64, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317fbec0>", "describer_id": 47}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.312000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 47, "id": 5, "source": "L['blocked_encoder_mask']"}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.317000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"} | |
[ | |
] | |
V0627 17:31:10.317000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1, "has_payload": "9d228664307649151c1145ad228290a7"} | |
TREE_GUARD_MANAGER: | |
+- RootGuardManager | |
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards | |
| +- GLOBAL_STATE: ___check_global_state() | |
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self) | |
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202274768) | |
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824) | |
| | | +- GuardManager: source=L['self']._modules, accessed_by=DictGetItemGuardAccessor(_modules) | |
| | | | +- GuardManager: source=L['self'].attention, accessed_by=DictGetItemGuardAccessor(attention) | |
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].attention, 139839202265168) | |
| | | | | +- GuardManager: source=L['self'].attention.__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | | | | +- GuardManager: source=L['self'].attention.training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].attention.training, 7685824) | |
| +- GuardManager: source=L['to_mask'], accessed_by=DictGetItemGuardAccessor(to_mask) | |
| | +- TENSOR_MATCH: check_tensor(L['to_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 1, 832], stride=[832, 832, 832, 1]) | |
| | +- NO_HASATTR: hasattr(L['to_mask'], '_dynamo_dynamic_indices') == False | |
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['blocked_encoder_mask']) | |
| +- GuardManager: source=L['band_mask'], accessed_by=DictGetItemGuardAccessor(band_mask) | |
| | +- TENSOR_MATCH: check_tensor(L['band_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 9, 64, 192], stride=[110592, 110592, 12288, 192, 1]) | |
| | +- NO_HASATTR: hasattr(L['band_mask'], '_dynamo_dynamic_indices') == False | |
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['blocked_encoder_mask']) | |
| +- GuardManager: source=L['from_mask'], accessed_by=DictGetItemGuardAccessor(from_mask) | |
| | +- TENSOR_MATCH: check_tensor(L['from_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 832, 1], stride=[832, 832, 1, 1]) | |
| | +- NO_HASATTR: hasattr(L['from_mask'], '_dynamo_dynamic_indices') == False | |
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['blocked_encoder_mask']) | |
| +- GuardManager: source=L['head_mask'], accessed_by=DictGetItemGuardAccessor(head_mask) | |
| | +- ID_MATCH: ___check_obj_id(L['head_mask'], 7636800) | |
| +- GuardManager: source=L['hidden_states'], accessed_by=DictGetItemGuardAccessor(hidden_states) | |
| | +- TENSOR_MATCH: check_tensor(L['hidden_states'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 832, 768], stride=[638976, 768, 1]) | |
| | +- NO_HASATTR: hasattr(L['hidden_states'], '_dynamo_dynamic_indices') == False | |
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['blocked_encoder_mask']) | |
| +- GuardManager: source=L['attention_mask'], accessed_by=DictGetItemGuardAccessor(attention_mask) | |
| | +- ID_MATCH: ___check_obj_id(L['attention_mask'], 7636800) | |
| +- GuardManager: source=L['past_key_value'], accessed_by=DictGetItemGuardAccessor(past_key_value) | |
| | +- ID_MATCH: ___check_obj_id(L['past_key_value'], 7636800) | |
| +- GuardManager: source=L['output_attentions'], accessed_by=DictGetItemGuardAccessor(output_attentions) | |
| | +- ID_MATCH: ___check_obj_id(L['output_attentions'], 7685824) | |
| +- GuardManager: source=L['blocked_encoder_mask'], accessed_by=DictGetItemGuardAccessor(blocked_encoder_mask) | |
| | +- TENSOR_MATCH: check_tensor(L['blocked_encoder_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 13, 64], stride=[832, 64, 1]) | |
| | +- NO_HASATTR: hasattr(L['blocked_encoder_mask'], '_dynamo_dynamic_indices') == False | |
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['blocked_encoder_mask']) | |
| +- GuardManager: source=L['encoder_hidden_states'], accessed_by=DictGetItemGuardAccessor(encoder_hidden_states) | |
| | +- ID_MATCH: ___check_obj_id(L['encoder_hidden_states'], 7636800) | |
| +- GuardManager: source=L['encoder_attention_mask'], accessed_by=DictGetItemGuardAccessor(encoder_attention_mask) | |
| | +- ID_MATCH: ___check_obj_id(L['encoder_attention_mask'], 7636800) | |
V0627 17:31:10.318000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "7/1", "frame_key": "24", "co_name": "forward", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 1472, "cache_size": 0, "accumulated_cache_size": 1, "guard_count": 18, "shape_env_guard_count": 0, "graph_op_count": 0, "graph_node_count": 5, "graph_input_count": 5, "start_time": 1719534670.2629929, "entire_frame_compile_time_s": 0.05506253242492676, "backend_compile_time_s": null, "inductor_compile_time_s": null, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Graph break due to unsupported builtin numpy.random.mtrand.seed. This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind). If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround. If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/docs/main/notes/custom_operators.html for more details) or, if it is traceable, use torch.compiler.allow_in_graph."], "dynamo_time_before_restart_s": 0.04132270812988281, "has_guarded_code": true}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.318000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.320000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 48, "size": 442368}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.320000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 48}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.320000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 48, "id": 0, "source": "L['band_mask']"}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.321000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 48, "size": 2555904}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.321000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e301592b0>", "describer_id": 48}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.321000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 48, "id": 1, "source": "L['hidden_states']"}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.323000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 48, "size": 3328}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.323000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 48}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.323000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 2, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 48}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.323000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 48, "id": 2, "source": "L['from_mask']"}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.325000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 4, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 48}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.325000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 48, "id": 4, "source": "L['to_mask']"}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.355000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 49, "size": 442368}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.355000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 49}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.355000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 49, "id": 0, "source": "L['band_mask']"}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.356000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 49, "size": 2555904}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.356000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e301592b0>", "describer_id": 49}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.356000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 49, "id": 1, "source": "L['hidden_states']"}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.357000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 49, "size": 3328}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.357000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 49}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.358000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 2, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 49}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.358000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 49, "id": 2, "source": "L['from_mask']"}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.360000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 4, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 49}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.360000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 49, "id": 4, "source": "L['to_mask']"}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.362000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 5, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 13, 64], "is_leaf": true, "is_view": true, "stride": [832, 64, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317fbec0>", "describer_id": 49}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.362000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 49, "id": 5, "source": "L['from_blocked_mask']"}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.365000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {"l_band_mask_": [1, 1, 9, 64, 192], "l_from_mask_": [1, 1, 832, 1], "l_to_mask_": [1, 1, 1, 832], "band_mask": [1, 1, 9, 64, 192], "from_mask": [1, 1, 832, 1], "to_mask": [1, 1, 1, 832]}}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1, "has_payload": "b5eca5f100188f494b5033015854eb4e"} | |
class GraphModule(torch.nn.Module): | |
def forward(self, L_band_mask_: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu", L_from_mask_: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu", L_to_mask_: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu"): | |
l_band_mask_ = L_band_mask_ | |
l_from_mask_ = L_from_mask_ | |
l_to_mask_ = L_to_mask_ | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1383 in forward, code: band_mask = band_mask.to(hidden_states.dtype) | |
band_mask: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = l_band_mask_.to(torch.float32); l_band_mask_ = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1385 in forward, code: from_mask = from_mask.to(hidden_states.dtype) | |
from_mask: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu" = l_from_mask_.to(torch.float32); l_from_mask_ = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1387 in forward, code: to_mask = to_mask.to(hidden_states.dtype) | |
to_mask: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = l_to_mask_.to(torch.float32); l_to_mask_ = None | |
return (band_mask, from_mask, to_mask) | |
V0627 17:31:10.380000 139845268738432 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:199] {"aot_forward_graph": {}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1, "has_payload": "3ad949bb5c0c76a73cad2e99f5c9ebe2"} | |
class <lambda>(torch.nn.Module): | |
def forward(self, arg0_1: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu", arg1_1: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu", arg2_1: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu"): | |
return (arg0_1, arg1_1, arg2_1) | |
V0627 17:31:10.392000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"} | |
[ | |
] | |
V0627 17:31:10.392000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1, "has_payload": "78c6200e495d09cd995b82c1e530d62e"} | |
TREE_GUARD_MANAGER: | |
+- RootGuardManager | |
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards | |
| +- GLOBAL_STATE: ___check_global_state() | |
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self) | |
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202265168) | |
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824) | |
| | | +- GuardManager: source=L['self']._modules, accessed_by=DictGetItemGuardAccessor(_modules) | |
| | | | +- GuardManager: source=L['self'].self, accessed_by=DictGetItemGuardAccessor(self) | |
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].self, 139839202264976) | |
| | | | | +- GuardManager: source=L['self'].self.__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | | | | +- GuardManager: source=L['self'].self.training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].self.training, 7685824) | |
| | | +- GuardManager: source=L['self'].attention_type, accessed_by=DictGetItemGuardAccessor(attention_type) | |
| | | | +- EQUALS_MATCH: L['self'].attention_type == 'block_sparse' | |
| +- GuardManager: source=L['to_mask'], accessed_by=DictGetItemGuardAccessor(to_mask) | |
| | +- TENSOR_MATCH: check_tensor(L['to_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 1, 832], stride=[832, 832, 832, 1]) | |
| | +- NO_HASATTR: hasattr(L['to_mask'], '_dynamo_dynamic_indices') == False | |
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask']) | |
| +- GuardManager: source=L['band_mask'], accessed_by=DictGetItemGuardAccessor(band_mask) | |
| | +- TENSOR_MATCH: check_tensor(L['band_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 9, 64, 192], stride=[110592, 110592, 12288, 192, 1]) | |
| | +- NO_HASATTR: hasattr(L['band_mask'], '_dynamo_dynamic_indices') == False | |
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask']) | |
| +- GuardManager: source=L['from_mask'], accessed_by=DictGetItemGuardAccessor(from_mask) | |
| | +- TENSOR_MATCH: check_tensor(L['from_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 832, 1], stride=[832, 832, 1, 1]) | |
| | +- NO_HASATTR: hasattr(L['from_mask'], '_dynamo_dynamic_indices') == False | |
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask']) | |
| +- GuardManager: source=L['hidden_states'], accessed_by=DictGetItemGuardAccessor(hidden_states) | |
| | +- TENSOR_MATCH: check_tensor(L['hidden_states'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 832, 768], stride=[638976, 768, 1]) | |
| | +- NO_HASATTR: hasattr(L['hidden_states'], '_dynamo_dynamic_indices') == False | |
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask']) | |
| +- GuardManager: source=L['from_blocked_mask'], accessed_by=DictGetItemGuardAccessor(from_blocked_mask) | |
| | +- TENSOR_MATCH: check_tensor(L['from_blocked_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 13, 64], stride=[832, 64, 1]) | |
| | +- NO_HASATTR: hasattr(L['from_blocked_mask'], '_dynamo_dynamic_indices') == False | |
| | +- TENSOR_ALIASING: L['from_blocked_mask'] is L['to_blocked_mask'] | |
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask']) | |
| +- GuardManager: source=L['output_attentions'], accessed_by=DictGetItemGuardAccessor(output_attentions) | |
| | +- ID_MATCH: ___check_obj_id(L['output_attentions'], 7685824) | |
| +- GuardManager: source=L['encoder_hidden_states'], accessed_by=DictGetItemGuardAccessor(encoder_hidden_states) | |
| | +- ID_MATCH: ___check_obj_id(L['encoder_hidden_states'], 7636800) | |
| +- GuardManager: source=L['to_blocked_mask'], accessed_by=DictGetItemGuardAccessor(to_blocked_mask) | |
| | +- TENSOR_ALIASING: L['from_blocked_mask'] is L['to_blocked_mask'] | |
V0627 17:31:10.392000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "8/1", "frame_key": "25", "co_name": "forward", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 1365, "cache_size": 0, "accumulated_cache_size": 1, "guard_count": 16, "shape_env_guard_count": 0, "graph_op_count": 3, "graph_node_count": 7, "graph_input_count": 3, "start_time": 1719534670.3189635, "entire_frame_compile_time_s": 0.07366013526916504, "backend_compile_time_s": 0.02211451530456543, "inductor_compile_time_s": 0.00025773048400878906, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Graph break due to unsupported builtin numpy.random.mtrand.seed. This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind). If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround. If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/docs/main/notes/custom_operators.html for more details) or, if it is traceable, use torch.compiler.allow_in_graph."], "dynamo_time_before_restart_s": 0.03475379943847656, "has_guarded_code": true}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.393000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 9, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.395000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 51, "size": 2555904}, "frame_id": 9, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.395000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e301592b0>", "describer_id": 51}, "frame_id": 9, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.395000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 51, "id": 0, "source": "L['hidden_states']"}, "frame_id": 9, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.419000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 52, "size": 2555904}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.419000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e301592b0>", "describer_id": 52}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.419000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 52, "id": 0, "source": "L['hidden_states']"}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.434000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 7, "describer_id": 52, "size": 442368}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.434000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 7, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 7, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 52}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.434000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 52, "id": 7, "source": "L['band_mask']"}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.435000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 8, "describer_id": 52, "size": 3328}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.435000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 9, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 8, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 52}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.435000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 8, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 8, "base": 9, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 52}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.435000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 52, "id": 8, "source": "L['from_mask']"}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.437000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 10, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 8, "base": 9, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 52}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.437000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 52, "id": 10, "source": "L['to_mask']"}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.438000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 11, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 13, 64], "is_leaf": true, "is_view": true, "stride": [832, 64, 1], "storage": 8, "base": 9, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317fbec0>", "describer_id": 52}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.438000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 52, "id": 11, "source": "L['from_blocked_mask']"}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.441000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {"l_hidden_states_": [1, 832, 768], "l__self___query": [1, 832, 768], "x": [1, 832, 12, 64], "query_layer": [1, 12, 832, 64], "l__self___key": [1, 832, 768], "x_1": [1, 832, 12, 64], "key_layer": [1, 12, 832, 64], "l__self___value": [1, 832, 768], "x_2": [1, 832, 12, 64], "value_layer": [1, 12, 832, 64]}}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1, "has_payload": "0aeb326082c3dce6efb144a844a46bdf"} | |
class GraphModule(torch.nn.Module): | |
def forward(self, L_hidden_states_: "f32[1, 832, 768][638976, 768, 1]cpu"): | |
l_hidden_states_ = L_hidden_states_ | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:468 in forward, code: query_layer = self.transpose_for_scores(self.query(hidden_states)) | |
l__self___query: "bf16[1, 832, 768][638976, 768, 1]cpu" = self.L__self___query(l_hidden_states_) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape) | |
x: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = l__self___query.view(1, 832, 12, 64); l__self___query = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3) | |
query_layer: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = x.permute(0, 2, 1, 3); x = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:469 in forward, code: key_layer = self.transpose_for_scores(self.key(hidden_states)) | |
l__self___key: "bf16[1, 832, 768][638976, 768, 1]cpu" = self.L__self___key(l_hidden_states_) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape) | |
x_1: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = l__self___key.view(1, 832, 12, 64); l__self___key = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3) | |
key_layer: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = x_1.permute(0, 2, 1, 3); x_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:470 in forward, code: value_layer = self.transpose_for_scores(self.value(hidden_states)) | |
l__self___value: "bf16[1, 832, 768][638976, 768, 1]cpu" = self.L__self___value(l_hidden_states_); l_hidden_states_ = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape) | |
x_2: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = l__self___value.view(1, 832, 12, 64); l__self___value = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3) | |
value_layer: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = x_2.permute(0, 2, 1, 3); x_2 = None | |
return (query_layer, key_layer, value_layer) | |
V0627 17:31:10.491000 139845268738432 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:199] {"aot_forward_graph": {}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1, "has_payload": "d91c82abb65a6c80f96ee652ae86f63d"} | |
class <lambda>(torch.nn.Module): | |
def forward(self, arg0_1: "f32[768, 768][768, 1]cpu", arg1_1: "f32[768][1]cpu", arg2_1: "f32[768, 768][768, 1]cpu", arg3_1: "f32[768][1]cpu", arg4_1: "f32[768, 768][768, 1]cpu", arg5_1: "f32[768][1]cpu", arg6_1: "f32[1, 832, 768][638976, 768, 1]cpu"): | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:468 in forward, code: query_layer = self.transpose_for_scores(self.query(hidden_states)) | |
convert_element_type: "bf16[768][1]cpu" = torch.ops.prims.convert_element_type.default(arg1_1, torch.bfloat16); arg1_1 = None | |
convert_element_type_1: "bf16[768, 768][768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg0_1, torch.bfloat16); arg0_1 = None | |
convert_element_type_2: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg6_1, torch.bfloat16) | |
view: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.view.default(convert_element_type_2, [832, 768]); convert_element_type_2 = None | |
permute: "bf16[768, 768][1, 768]cpu" = torch.ops.aten.permute.default(convert_element_type_1, [1, 0]); convert_element_type_1 = None | |
addmm: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.addmm.default(convert_element_type, view, permute); convert_element_type = view = permute = None | |
view_1: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.view.default(addmm, [1, 832, 768]); addmm = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape) | |
view_2: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.view.default(view_1, [1, 832, 12, 64]); view_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3) | |
permute_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_2, [0, 2, 1, 3]); view_2 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:469 in forward, code: key_layer = self.transpose_for_scores(self.key(hidden_states)) | |
convert_element_type_6: "bf16[768][1]cpu" = torch.ops.prims.convert_element_type.default(arg3_1, torch.bfloat16); arg3_1 = None | |
convert_element_type_7: "bf16[768, 768][768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg2_1, torch.bfloat16); arg2_1 = None | |
convert_element_type_8: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg6_1, torch.bfloat16) | |
view_3: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.view.default(convert_element_type_8, [832, 768]); convert_element_type_8 = None | |
permute_2: "bf16[768, 768][1, 768]cpu" = torch.ops.aten.permute.default(convert_element_type_7, [1, 0]); convert_element_type_7 = None | |
addmm_1: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.addmm.default(convert_element_type_6, view_3, permute_2); convert_element_type_6 = view_3 = permute_2 = None | |
view_4: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.view.default(addmm_1, [1, 832, 768]); addmm_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape) | |
view_5: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.view.default(view_4, [1, 832, 12, 64]); view_4 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3) | |
permute_3: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_5, [0, 2, 1, 3]); view_5 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:470 in forward, code: value_layer = self.transpose_for_scores(self.value(hidden_states)) | |
convert_element_type_12: "bf16[768][1]cpu" = torch.ops.prims.convert_element_type.default(arg5_1, torch.bfloat16); arg5_1 = None | |
convert_element_type_13: "bf16[768, 768][768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg4_1, torch.bfloat16); arg4_1 = None | |
convert_element_type_14: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg6_1, torch.bfloat16); arg6_1 = None | |
view_6: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.view.default(convert_element_type_14, [832, 768]); convert_element_type_14 = None | |
permute_4: "bf16[768, 768][1, 768]cpu" = torch.ops.aten.permute.default(convert_element_type_13, [1, 0]); convert_element_type_13 = None | |
addmm_2: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.addmm.default(convert_element_type_12, view_6, permute_4); convert_element_type_12 = view_6 = permute_4 = None | |
view_7: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.view.default(addmm_2, [1, 832, 768]); addmm_2 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape) | |
view_8: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.view.default(view_7, [1, 832, 12, 64]); view_7 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3) | |
permute_5: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_8, [0, 2, 1, 3]); view_8 = None | |
return (permute_1, permute_3, permute_5) | |
V0627 17:31:10.557000 139845268738432 torch/_inductor/compile_fx.py:754] {"inductor_post_grad_graph": {}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1, "has_payload": "1df7e432445d21ccb26ae7f35b4ee86e"} | |
class <lambda>(torch.nn.Module): | |
def forward(self, arg6_1: "f32[1, 832, 768][638976, 768, 1]cpu"): | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:468 in forward, code: query_layer = self.transpose_for_scores(self.query(hidden_states)) | |
_frozen_param6: "bf16[768][1]cpu" = self._frozen_param6 | |
# No stacktrace found for following nodes | |
_frozen_param12: "bf16[768, 768][1, 0]cpu" = self._frozen_param12 | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:469 in forward, code: key_layer = self.transpose_for_scores(self.key(hidden_states)) | |
_frozen_param8: "bf16[768][1]cpu" = self._frozen_param8 | |
# No stacktrace found for following nodes | |
_frozen_param13: "bf16[768, 768][1, 0]cpu" = self._frozen_param13 | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:470 in forward, code: value_layer = self.transpose_for_scores(self.value(hidden_states)) | |
_frozen_param10: "bf16[768][1]cpu" = self._frozen_param10 | |
# No stacktrace found for following nodes | |
_frozen_param14: "bf16[768, 768][1, 0]cpu" = self._frozen_param14 | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:468 in forward, code: query_layer = self.transpose_for_scores(self.query(hidden_states)) | |
convert_element_type_2: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg6_1, torch.bfloat16); arg6_1 = None | |
_linear_pointwise_default_5: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.mkldnn._linear_pointwise.default(convert_element_type_2, _frozen_param12, _frozen_param6, 'none', [], ''); _frozen_param12 = _frozen_param6 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape) | |
view_2: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.reshape.default(_linear_pointwise_default_5, [1, 832, 12, 64]); _linear_pointwise_default_5 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3) | |
permute_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_2, [0, 2, 1, 3]); view_2 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:469 in forward, code: key_layer = self.transpose_for_scores(self.key(hidden_states)) | |
_linear_pointwise_default_4: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.mkldnn._linear_pointwise.default(convert_element_type_2, _frozen_param13, _frozen_param8, 'none', [], ''); _frozen_param13 = _frozen_param8 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape) | |
view_5: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.reshape.default(_linear_pointwise_default_4, [1, 832, 12, 64]); _linear_pointwise_default_4 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3) | |
permute_3: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_5, [0, 2, 1, 3]); view_5 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:470 in forward, code: value_layer = self.transpose_for_scores(self.value(hidden_states)) | |
_linear_pointwise_default_3: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.mkldnn._linear_pointwise.default(convert_element_type_2, _frozen_param14, _frozen_param10, 'none', [], ''); convert_element_type_2 = _frozen_param14 = _frozen_param10 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape) | |
view_8: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.reshape.default(_linear_pointwise_default_3, [1, 832, 12, 64]); _linear_pointwise_default_3 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3) | |
permute_5: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_8, [0, 2, 1, 3]); view_8 = None | |
return (permute_1, permute_3, permute_5) | |
V0627 17:31:10.578000 139845268738432 torch/_inductor/graph.py:1693] {"inductor_output_code": {"filename": "/tmp/torchinductor_leslie/wm/cwm7ec52zxt6bl7gt2h7sahtj5wsw4g7ez4jvozekjwtw7nqdl3v.py"}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1, "has_payload": "c068758cb8977ae26fcf611c09070a9a"} | |
# AOT ID: ['13_inference'] | |
from ctypes import c_void_p, c_long | |
import torch | |
import math | |
import random | |
import os | |
import tempfile | |
from math import inf, nan | |
from torch._inductor.hooks import run_intermediate_hooks | |
from torch._inductor.utils import maybe_profile | |
from torch._inductor.codegen.memory_planning import _align as align | |
from torch import device, empty_strided | |
from torch._inductor.async_compile import AsyncCompile | |
from torch._inductor.select_algorithm import extern_kernels | |
from torch._inductor.codegen.multi_kernel import MultiKernelCall | |
aten = torch.ops.aten | |
inductor_ops = torch.ops.inductor | |
_quantized = torch.ops._quantized | |
assert_size_stride = torch._C._dynamo.guards.assert_size_stride | |
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu | |
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda | |
alloc_from_pool = torch.ops.inductor._alloc_from_pool | |
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor | |
async_compile = AsyncCompile() | |
_frozen_param6 = None # device(type='cpu') torch.bfloat16 (768,) (1,) 7f2e300d5490 | |
_frozen_param12 = None # device(type='cpu') torch.bfloat16 (768, 768) (1, 0) 7f2e300daed0 | |
_frozen_param8 = None # device(type='cpu') torch.bfloat16 (768,) (1,) 7f2e300a0c70 | |
_frozen_param13 = None # device(type='cpu') torch.bfloat16 (768, 768) (1, 0) 7f2e300a3e70 | |
_frozen_param10 = None # device(type='cpu') torch.bfloat16 (768,) (1,) 7f2e300a1490 | |
_frozen_param14 = None # device(type='cpu') torch.bfloat16 (768, 768) (1, 0) 7f2e300dbfb0 | |
cpp_fused__to_copy_0 = async_compile.cpp_pybinding(['const float*', 'bfloat16*'], ''' | |
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h" | |
extern "C" void kernel(const float* in_ptr0, | |
bfloat16* out_ptr0) | |
{ | |
#pragma omp parallel num_threads(56) | |
{ | |
int tid = omp_get_thread_num(); | |
{ | |
#pragma omp for | |
for(long x0=static_cast<long>(0L); x0<static_cast<long>(638976L); x0+=static_cast<long>(16L)) | |
{ | |
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<long>(x0), 16); | |
auto tmp1 = at::vec::convert<bfloat16>(tmp0); | |
tmp1.store(out_ptr0 + static_cast<long>(x0), 16); | |
} | |
} | |
} | |
} | |
''') | |
async_compile.wait(globals()) | |
del async_compile | |
def call(args): | |
arg6_1, = args | |
args.clear() | |
assert_size_stride(arg6_1, (1, 832, 768), (638976, 768, 1)) | |
buf0 = empty_strided_cpu((1, 832, 768), (638976, 768, 1), torch.bfloat16) | |
cpp_fused__to_copy_0(arg6_1, buf0) | |
del arg6_1 | |
buf1 = torch.ops.mkldnn._linear_pointwise(buf0, _frozen_param12, _frozen_param6, 'none', [-1], '') | |
buf2 = torch.ops.mkldnn._linear_pointwise(buf0, _frozen_param13, _frozen_param8, 'none', [-1], '') | |
buf3 = torch.ops.mkldnn._linear_pointwise(buf0, _frozen_param14, _frozen_param10, 'none', [-1], '') | |
return (reinterpret_tensor(buf1, (1, 12, 832, 64), (638976, 64, 768, 1), 0), reinterpret_tensor(buf2, (1, 12, 832, 64), (638976, 64, 768, 1), 0), reinterpret_tensor(buf3, (1, 12, 832, 64), (638976, 64, 768, 1), 0), ) | |
def benchmark_compiled_module(times=10, repeat=10): | |
from torch._dynamo.testing import rand_strided | |
from torch._inductor.utils import print_performance | |
global _frozen_param6 | |
_frozen_param6 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.bfloat16) | |
global _frozen_param12 | |
_frozen_param12 = rand_strided((768, 768), (1, 0), device='cpu', dtype=torch.bfloat16) | |
global _frozen_param8 | |
_frozen_param8 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.bfloat16) | |
global _frozen_param13 | |
_frozen_param13 = rand_strided((768, 768), (1, 0), device='cpu', dtype=torch.bfloat16) | |
global _frozen_param10 | |
_frozen_param10 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.bfloat16) | |
global _frozen_param14 | |
_frozen_param14 = rand_strided((768, 768), (1, 0), device='cpu', dtype=torch.bfloat16) | |
arg6_1 = rand_strided((1, 832, 768), (638976, 768, 1), device='cpu', dtype=torch.float32) | |
fn = lambda: call([arg6_1]) | |
return print_performance(fn, times=times, repeat=repeat) | |
if __name__ == "__main__": | |
from torch._inductor.wrapper_benchmark import compiled_module_main | |
compiled_module_main('hf_BigBird', benchmark_compiled_module) | |
V0627 17:31:10.587000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"} | |
[ | |
] | |
V0627 17:31:10.588000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1, "has_payload": "da04fa8fdd18f2f15ae08b9dbbb492e0"} | |
TREE_GUARD_MANAGER: | |
+- RootGuardManager | |
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards | |
| +- GLOBAL_STATE: ___check_global_state() | |
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self) | |
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202264976) | |
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824) | |
| | | +- GuardManager: source=L['self']._modules, accessed_by=DictGetItemGuardAccessor(_modules) | |
| | | | +- GuardManager: source=L['self'].key, accessed_by=DictGetItemGuardAccessor(key) | |
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].key, 139839202265648) | |
| | | | | +- GuardManager: source=L['self'].key.__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | | | | +- GuardManager: source=L['self'].key.training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].key.training, 7685824) | |
| | | | +- GuardManager: source=L['self'].query, accessed_by=DictGetItemGuardAccessor(query) | |
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].query, 139839202265696) | |
| | | | | +- GuardManager: source=L['self'].query.__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | | | | +- GuardManager: source=L['self'].query.training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].query.training, 7685824) | |
| | | | +- GuardManager: source=L['self'].value, accessed_by=DictGetItemGuardAccessor(value) | |
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].value, 139839202264592) | |
| | | | | +- GuardManager: source=L['self'].value.__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | | | | +- GuardManager: source=L['self'].value.training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].value.training, 7685824) | |
| | | +- GuardManager: source=L['self'].seed, accessed_by=DictGetItemGuardAccessor(seed) | |
| | | | +- EQUALS_MATCH: L['self'].seed == 1 | |
| | | +- GuardManager: source=L['self'].block_size, accessed_by=DictGetItemGuardAccessor(block_size) | |
| | | | +- EQUALS_MATCH: L['self'].block_size == 64 | |
| | | +- GuardManager: source=L['self'].num_random_blocks, accessed_by=DictGetItemGuardAccessor(num_random_blocks) | |
| | | | +- EQUALS_MATCH: L['self'].num_random_blocks == 3 | |
| | | +- GuardManager: source=L['self'].attention_head_size, accessed_by=DictGetItemGuardAccessor(attention_head_size) | |
| | | | +- EQUALS_MATCH: L['self'].attention_head_size == 64 | |
| | | +- GuardManager: source=L['self'].num_attention_heads, accessed_by=DictGetItemGuardAccessor(num_attention_heads) | |
| | | | +- EQUALS_MATCH: L['self'].num_attention_heads == 12 | |
| +- GuardManager: source=L['to_mask'], accessed_by=DictGetItemGuardAccessor(to_mask) | |
| | +- TENSOR_MATCH: check_tensor(L['to_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 1, 832], stride=[832, 832, 832, 1]) | |
| | +- NO_HASATTR: hasattr(L['to_mask'], '_dynamo_dynamic_indices') == False | |
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask']) | |
| +- GuardManager: source=L['band_mask'], accessed_by=DictGetItemGuardAccessor(band_mask) | |
| | +- TENSOR_MATCH: check_tensor(L['band_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 9, 64, 192], stride=[110592, 110592, 12288, 192, 1]) | |
| | +- NO_HASATTR: hasattr(L['band_mask'], '_dynamo_dynamic_indices') == False | |
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask']) | |
| +- GuardManager: source=L['from_mask'], accessed_by=DictGetItemGuardAccessor(from_mask) | |
| | +- TENSOR_MATCH: check_tensor(L['from_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 832, 1], stride=[832, 832, 1, 1]) | |
| | +- NO_HASATTR: hasattr(L['from_mask'], '_dynamo_dynamic_indices') == False | |
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask']) | |
| +- GuardManager: source=L['hidden_states'], accessed_by=DictGetItemGuardAccessor(hidden_states) | |
| | +- TENSOR_MATCH: check_tensor(L['hidden_states'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 832, 768], stride=[638976, 768, 1]) | |
| | +- NO_HASATTR: hasattr(L['hidden_states'], '_dynamo_dynamic_indices') == False | |
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask']) | |
| +- GuardManager: source=L['from_blocked_mask'], accessed_by=DictGetItemGuardAccessor(from_blocked_mask) | |
| | +- TENSOR_MATCH: check_tensor(L['from_blocked_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 13, 64], stride=[832, 64, 1]) | |
| | +- NO_HASATTR: hasattr(L['from_blocked_mask'], '_dynamo_dynamic_indices') == False | |
| | +- TENSOR_ALIASING: L['from_blocked_mask'] is L['to_blocked_mask'] | |
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask']) | |
| +- GuardManager: source=L['output_attentions'], accessed_by=DictGetItemGuardAccessor(output_attentions) | |
| | +- ID_MATCH: ___check_obj_id(L['output_attentions'], 7685824) | |
| +- GuardManager: source=L['to_blocked_mask'], accessed_by=DictGetItemGuardAccessor(to_blocked_mask) | |
| | +- TENSOR_ALIASING: L['from_blocked_mask'] is L['to_blocked_mask'] | |
V0627 17:31:10.588000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "9/1", "frame_key": "26", "co_name": "forward", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 446, "cache_size": 0, "accumulated_cache_size": 1, "guard_count": 21, "shape_env_guard_count": 0, "graph_op_count": 9, "graph_node_count": 11, "graph_input_count": 1, "start_time": 1719534670.3936255, "entire_frame_compile_time_s": 0.19471240043640137, "backend_compile_time_s": 0.1402432918548584, "inductor_compile_time_s": 0.033010005950927734, "code_gen_time_s": 0.012862920761108398, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Graph break due to unsupported builtin numpy.random.mtrand.seed. This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind). If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround. If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/docs/main/notes/custom_operators.html for more details) or, if it is traceable, use torch.compiler.allow_in_graph."], "dynamo_time_before_restart_s": 0.024178743362426758, "has_guarded_code": true}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.589000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 472, "name": "forward", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 10, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.614000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 10, "frame_compile_id": 1, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"} | |
[ | |
] | |
V0627 17:31:10.614000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 10, "frame_compile_id": 1, "attempt": 1, "has_payload": "1ea07e64f0c0d490d94336fa323c05e9"} | |
TREE_GUARD_MANAGER: | |
+- RootGuardManager | |
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards | |
| +- GLOBAL_STATE: ___check_global_state() | |
| +- GuardManager: source=L['seed'], accessed_by=DictGetItemGuardAccessor(seed) | |
| | +- EQUALS_MATCH: L['seed'] == 1 | |
| +- GuardManager: source=L['batch_size'], accessed_by=DictGetItemGuardAccessor(batch_size) | |
| | +- EQUALS_MATCH: L['batch_size'] == 1 | |
| +- GuardManager: source=L['to_seq_len'], accessed_by=DictGetItemGuardAccessor(to_seq_len) | |
| | +- EQUALS_MATCH: L['to_seq_len'] == 832 | |
| +- GuardManager: source=L['from_seq_len'], accessed_by=DictGetItemGuardAccessor(from_seq_len) | |
| | +- EQUALS_MATCH: L['from_seq_len'] == 832 | |
| +- GuardManager: source=L['to_block_size'], accessed_by=DictGetItemGuardAccessor(to_block_size) | |
| | +- EQUALS_MATCH: L['to_block_size'] == 64 | |
| +- GuardManager: source=L['from_block_size'], accessed_by=DictGetItemGuardAccessor(from_block_size) | |
| | +- EQUALS_MATCH: L['from_block_size'] == 64 | |
| +- GuardManager: source=L['attention_head_size'], accessed_by=DictGetItemGuardAccessor(attention_head_size) | |
| | +- EQUALS_MATCH: L['attention_head_size'] == 64 | |
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor | |
| | +- GuardManager: source=G['np'], accessed_by=DictGetItemGuardAccessor(np) | |
| | | +- ID_MATCH: ___check_obj_id(G['np'], 139845228893488) | |
| | | +- GuardManager: source=G['np'].random, accessed_by=GetAttrGuardAccessor(random) | |
| | | | +- ID_MATCH: ___check_obj_id(G['np'].random, 139842452860464) | |
| | | | +- GuardManager: source=G['np'].random.seed, accessed_by=GetAttrGuardAccessor(seed) | |
| | | | | +- ID_MATCH: ___check_obj_id(G['np'].random.seed, 139842451129264) | |
| | +- GuardManager: source=G['math'], accessed_by=DictGetItemGuardAccessor(math) | |
| | | +- ID_MATCH: ___check_obj_id(G['math'], 139845236089744) | |
| | | +- GuardManager: source=G['math'].sqrt, accessed_by=GetAttrGuardAccessor(sqrt) | |
| | | | +- ID_MATCH: ___check_obj_id(G['math'].sqrt, 139845236093344) | |
V0627 17:31:10.615000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "10/1", "frame_key": "27", "co_name": "bigbird_block_sparse_attention", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 516, "cache_size": 1, "accumulated_cache_size": 1, "guard_count": 17, "shape_env_guard_count": 0, "graph_op_count": 0, "graph_node_count": 0, "graph_input_count": 0, "start_time": 1719534670.5898829, "entire_frame_compile_time_s": 0.02506852149963379, "backend_compile_time_s": null, "inductor_compile_time_s": null, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Graph break due to unsupported builtin numpy.random.mtrand.seed. This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind). If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround. If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/docs/main/notes/custom_operators.html for more details) or, if it is traceable, use torch.compiler.allow_in_graph."], "dynamo_time_before_restart_s": 0.009800434112548828, "has_guarded_code": true}, "frame_id": 10, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.615000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 472, "name": "forward", "filename": 5}, {"line": 569, "name": "bigbird_block_sparse_attention", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 11, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.647000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 11, "frame_compile_id": 1, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"} | |
[ | |
] | |
V0627 17:31:10.648000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 11, "frame_compile_id": 1, "attempt": 1, "has_payload": "b6b8c289bd494c29f862b3959f02ec26"} | |
TREE_GUARD_MANAGER: | |
+- RootGuardManager | |
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards | |
| +- GLOBAL_STATE: ___check_global_state() | |
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self) | |
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202264976) | |
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824) | |
| +- GuardManager: source=L['n_heads'], accessed_by=DictGetItemGuardAccessor(n_heads) | |
| | +- EQUALS_MATCH: L['n_heads'] == 12 | |
| +- GuardManager: source=L['to_seq_len'], accessed_by=DictGetItemGuardAccessor(to_seq_len) | |
| | +- EQUALS_MATCH: L['to_seq_len'] == 832 | |
| +- GuardManager: source=L['from_seq_len'], accessed_by=DictGetItemGuardAccessor(from_seq_len) | |
| | +- EQUALS_MATCH: L['from_seq_len'] == 832 | |
| +- GuardManager: source=L['n_rand_blocks'], accessed_by=DictGetItemGuardAccessor(n_rand_blocks) | |
| | +- EQUALS_MATCH: L['n_rand_blocks'] == 3 | |
| +- GuardManager: source=L['to_block_size'], accessed_by=DictGetItemGuardAccessor(to_block_size) | |
| | +- EQUALS_MATCH: L['to_block_size'] == 64 | |
| +- GuardManager: source=L['from_block_size'], accessed_by=DictGetItemGuardAccessor(from_block_size) | |
| | +- EQUALS_MATCH: L['from_block_size'] == 64 | |
| +- GuardManager: source=L['plan_from_length'], accessed_by=DictGetItemGuardAccessor(plan_from_length) | |
| | +- ID_MATCH: ___check_obj_id(L['plan_from_length'], 7636800) | |
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor | |
| | +- GuardManager: source=G['__builtins_dict___69'], accessed_by=DictGetItemGuardAccessor(__builtins_dict___69) | |
| | | +- GuardManager: source=G['__builtins_dict___69']['int'], accessed_by=DictGetItemGuardAccessor(int) | |
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___69']['int'], 7648640) | |
V0627 17:31:10.648000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "11/1", "frame_key": "28", "co_name": "torch_dynamo_resume_in_bigbird_block_sparse_attention_at_569", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 569, "cache_size": 0, "accumulated_cache_size": 1, "guard_count": 14, "shape_env_guard_count": 0, "graph_op_count": 0, "graph_node_count": 0, "graph_input_count": 0, "start_time": 1719534670.6159284, "entire_frame_compile_time_s": 0.03219175338745117, "backend_compile_time_s": null, "inductor_compile_time_s": null, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["data dependent operator: aten._local_scalar_dense.default; to enable, set torch._dynamo.config.capture_scalar_outputs = True"], "dynamo_time_before_restart_s": 0.01743292808532715, "has_guarded_code": true}, "frame_id": 11, "frame_compile_id": 1, "attempt": 1} | |
V0627 17:31:10.649000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 472, "name": "forward", "filename": 5}, {"line": 569, "name": "bigbird_block_sparse_attention", "filename": 5}, {"line": 583, "name": "torch_dynamo_resume_in_bigbird_block_sparse_attention_at_569", "filename": 5}, {"line": 1165, "name": "_bigbird_block_rand_mask_with_head", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, 
"frame_id": 14, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.653000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.653000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e300921b0>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.653000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 58, "id": 0, "source": "___from_numpy(L['___stack0'][0])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.655000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.655000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30091ee0>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.655000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 58, "id": 1, "source": "___from_numpy(L['___stack0'][1])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.657000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.657000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 2, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30092c00>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.657000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 58, "id": 2, "source": "___from_numpy(L['___stack0'][2])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.659000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 3, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.659000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 3, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30152570>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.659000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 58, "id": 3, "source": "___from_numpy(L['___stack0'][3])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.661000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 4, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.661000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 4, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 4, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ffc0040>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.661000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 58, "id": 4, "source": "___from_numpy(L['___stack0'][4])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.663000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 5, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.663000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 5, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 5, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ffc0950>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.663000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 58, "id": 5, "source": "___from_numpy(L['___stack0'][5])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.665000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 6, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.665000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 6, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 6, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ffc0680>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.665000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 58, "id": 6, "source": "___from_numpy(L['___stack0'][6])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.667000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 7, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.667000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 7, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 7, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ffc1530>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.667000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 58, "id": 7, "source": "___from_numpy(L['___stack0'][7])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.669000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 8, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.669000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 8, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 8, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ffc1e90>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.669000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 58, "id": 8, "source": "___from_numpy(L['___stack0'][8])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.671000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 9, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.671000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 9, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 9, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ffc2840>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.671000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 58, "id": 9, "source": "___from_numpy(L['___stack0'][9])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.673000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 10, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.673000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 10, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 10, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ffc30b0>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.674000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 58, "id": 10, "source": "___from_numpy(L['___stack0'][10])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.675000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 11, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.676000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 11, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 11, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ffc3470>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.676000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 58, "id": 11, "source": "___from_numpy(L['___stack0'][11])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.681000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {"l_stack0_0_": [13, 3], "l_stack0_1_": [13, 3], "l_stack0_2_": [13, 3], "l_stack0_3_": [13, 3], "l_stack0_4_": [13, 3], "l_stack0_5_": [13, 3], "l_stack0_6_": [13, 3], "l_stack0_7_": [13, 3], "l_stack0_8_": [13, 3], "l_stack0_9_": [13, 3], "l_stack0_10_": [13, 3], "l_stack0_11_": [13, 3], "wrapped_getitem": [11, 3], "wrapped_getitem_1": [11, 3], "wrapped_getitem_2": [11, 3], "wrapped_getitem_3": [11, 3], "wrapped_getitem_4": [11, 3], "wrapped_getitem_5": [11, 3], "wrapped_getitem_6": [11, 3], "wrapped_getitem_7": [11, 3], "wrapped_getitem_8": [11, 3], "wrapped_getitem_9": [11, 3], "wrapped_getitem_10": [11, 3], "wrapped_getitem_11": [11, 3]}}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0, "has_payload": "17b857365a2979b2936469716bb70fcd"} | |
class GraphModule(torch.nn.Module): | |
def forward(self, L_stack0_0_: "i32[13, 3][3, 1]cpu", L_stack0_1_: "i32[13, 3][3, 1]cpu", L_stack0_2_: "i32[13, 3][3, 1]cpu", L_stack0_3_: "i32[13, 3][3, 1]cpu", L_stack0_4_: "i32[13, 3][3, 1]cpu", L_stack0_5_: "i32[13, 3][3, 1]cpu", L_stack0_6_: "i32[13, 3][3, 1]cpu", L_stack0_7_: "i32[13, 3][3, 1]cpu", L_stack0_8_: "i32[13, 3][3, 1]cpu", L_stack0_9_: "i32[13, 3][3, 1]cpu", L_stack0_10_: "i32[13, 3][3, 1]cpu", L_stack0_11_: "i32[13, 3][3, 1]cpu"): | |
l_stack0_0_ = L_stack0_0_ | |
l_stack0_1_ = L_stack0_1_ | |
l_stack0_2_ = L_stack0_2_ | |
l_stack0_3_ = L_stack0_3_ | |
l_stack0_4_ = L_stack0_4_ | |
l_stack0_5_ = L_stack0_5_ | |
l_stack0_6_ = L_stack0_6_ | |
l_stack0_7_ = L_stack0_7_ | |
l_stack0_8_ = L_stack0_8_ | |
l_stack0_9_ = L_stack0_9_ | |
l_stack0_10_ = L_stack0_10_ | |
l_stack0_11_ = L_stack0_11_ | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1172 in torch_dynamo_resume_in__bigbird_block_rand_mask_with_head_at_1165, code: rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :] | |
wrapped_getitem: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem(l_stack0_0_, (slice(1, 12, None), slice(None, None, None))); l_stack0_0_ = None | |
wrapped_getitem_1: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_1(l_stack0_1_, (slice(1, 12, None), slice(None, None, None))); l_stack0_1_ = None | |
wrapped_getitem_2: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_2(l_stack0_2_, (slice(1, 12, None), slice(None, None, None))); l_stack0_2_ = None | |
wrapped_getitem_3: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_3(l_stack0_3_, (slice(1, 12, None), slice(None, None, None))); l_stack0_3_ = None | |
wrapped_getitem_4: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_4(l_stack0_4_, (slice(1, 12, None), slice(None, None, None))); l_stack0_4_ = None | |
wrapped_getitem_5: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_5(l_stack0_5_, (slice(1, 12, None), slice(None, None, None))); l_stack0_5_ = None | |
wrapped_getitem_6: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_6(l_stack0_6_, (slice(1, 12, None), slice(None, None, None))); l_stack0_6_ = None | |
wrapped_getitem_7: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_7(l_stack0_7_, (slice(1, 12, None), slice(None, None, None))); l_stack0_7_ = None | |
wrapped_getitem_8: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_8(l_stack0_8_, (slice(1, 12, None), slice(None, None, None))); l_stack0_8_ = None | |
wrapped_getitem_9: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_9(l_stack0_9_, (slice(1, 12, None), slice(None, None, None))); l_stack0_9_ = None | |
wrapped_getitem_10: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_10(l_stack0_10_, (slice(1, 12, None), slice(None, None, None))); l_stack0_10_ = None | |
wrapped_getitem_11: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_11(l_stack0_11_, (slice(1, 12, None), slice(None, None, None))); l_stack0_11_ = None | |
return (wrapped_getitem, wrapped_getitem_1, wrapped_getitem_2, wrapped_getitem_3, wrapped_getitem_4, wrapped_getitem_5, wrapped_getitem_6, wrapped_getitem_7, wrapped_getitem_8, wrapped_getitem_9, wrapped_getitem_10, wrapped_getitem_11) | |
V0627 17:31:10.751000 139845268738432 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:199] {"aot_forward_graph": {}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0, "has_payload": "949e5e36955b193a43ad76da2f5a5f52"} | |
class <lambda>(torch.nn.Module): | |
def forward(self, arg0_1: "i32[13, 3][3, 1]cpu", arg1_1: "i32[13, 3][3, 1]cpu", arg2_1: "i32[13, 3][3, 1]cpu", arg3_1: "i32[13, 3][3, 1]cpu", arg4_1: "i32[13, 3][3, 1]cpu", arg5_1: "i32[13, 3][3, 1]cpu", arg6_1: "i32[13, 3][3, 1]cpu", arg7_1: "i32[13, 3][3, 1]cpu", arg8_1: "i32[13, 3][3, 1]cpu", arg9_1: "i32[13, 3][3, 1]cpu", arg10_1: "i32[13, 3][3, 1]cpu", arg11_1: "i32[13, 3][3, 1]cpu"): | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1172 in torch_dynamo_resume_in__bigbird_block_rand_mask_with_head_at_1165, code: rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :] | |
slice_1: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg0_1, 0, 1, 12); arg0_1 = None | |
slice_2: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_1, 1, 0, 9223372036854775807); slice_1 = None | |
slice_3: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg1_1, 0, 1, 12); arg1_1 = None | |
slice_4: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_3, 1, 0, 9223372036854775807); slice_3 = None | |
slice_5: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg2_1, 0, 1, 12); arg2_1 = None | |
slice_6: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_5, 1, 0, 9223372036854775807); slice_5 = None | |
slice_7: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg3_1, 0, 1, 12); arg3_1 = None | |
slice_8: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_7, 1, 0, 9223372036854775807); slice_7 = None | |
slice_9: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg4_1, 0, 1, 12); arg4_1 = None | |
slice_10: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_9, 1, 0, 9223372036854775807); slice_9 = None | |
slice_11: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg5_1, 0, 1, 12); arg5_1 = None | |
slice_12: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_11, 1, 0, 9223372036854775807); slice_11 = None | |
slice_13: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg6_1, 0, 1, 12); arg6_1 = None | |
slice_14: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_13, 1, 0, 9223372036854775807); slice_13 = None | |
slice_15: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg7_1, 0, 1, 12); arg7_1 = None | |
slice_16: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_15, 1, 0, 9223372036854775807); slice_15 = None | |
slice_17: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg8_1, 0, 1, 12); arg8_1 = None | |
slice_18: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_17, 1, 0, 9223372036854775807); slice_17 = None | |
slice_19: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg9_1, 0, 1, 12); arg9_1 = None | |
slice_20: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_19, 1, 0, 9223372036854775807); slice_19 = None | |
slice_21: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg10_1, 0, 1, 12); arg10_1 = None | |
slice_22: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_21, 1, 0, 9223372036854775807); slice_21 = None | |
slice_23: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg11_1, 0, 1, 12); arg11_1 = None | |
slice_24: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_23, 1, 0, 9223372036854775807); slice_23 = None | |
return (slice_2, slice_4, slice_6, slice_8, slice_10, slice_12, slice_14, slice_16, slice_18, slice_20, slice_22, slice_24) | |
V0627 17:31:10.788000 139845268738432 torch/_inductor/compile_fx.py:754] {"inductor_post_grad_graph": {}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0, "has_payload": "9c01c298863b324227937cc74dd4d962"} | |
class <lambda>(torch.nn.Module): | |
def forward(self, arg0_1: "i32[13, 3][3, 1]cpu", arg1_1: "i32[13, 3][3, 1]cpu", arg2_1: "i32[13, 3][3, 1]cpu", arg3_1: "i32[13, 3][3, 1]cpu", arg4_1: "i32[13, 3][3, 1]cpu", arg5_1: "i32[13, 3][3, 1]cpu", arg6_1: "i32[13, 3][3, 1]cpu", arg7_1: "i32[13, 3][3, 1]cpu", arg8_1: "i32[13, 3][3, 1]cpu", arg9_1: "i32[13, 3][3, 1]cpu", arg10_1: "i32[13, 3][3, 1]cpu", arg11_1: "i32[13, 3][3, 1]cpu"): | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1172 in torch_dynamo_resume_in__bigbird_block_rand_mask_with_head_at_1165, code: rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :] | |
slice_1: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg0_1, 0, 1, 12); arg0_1 = None | |
slice_3: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg1_1, 0, 1, 12); arg1_1 = None | |
slice_5: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg2_1, 0, 1, 12); arg2_1 = None | |
slice_7: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg3_1, 0, 1, 12); arg3_1 = None | |
slice_9: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg4_1, 0, 1, 12); arg4_1 = None | |
slice_11: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg5_1, 0, 1, 12); arg5_1 = None | |
slice_13: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg6_1, 0, 1, 12); arg6_1 = None | |
slice_15: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg7_1, 0, 1, 12); arg7_1 = None | |
slice_17: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg8_1, 0, 1, 12); arg8_1 = None | |
slice_19: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg9_1, 0, 1, 12); arg9_1 = None | |
slice_21: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg10_1, 0, 1, 12); arg10_1 = None | |
slice_23: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg11_1, 0, 1, 12); arg11_1 = None | |
return (slice_1, slice_3, slice_5, slice_7, slice_9, slice_11, slice_13, slice_15, slice_17, slice_19, slice_21, slice_23) | |
V0627 17:31:10.802000 139845268738432 torch/_inductor/graph.py:1693] {"inductor_output_code": {"filename": "/tmp/torchinductor_leslie/of/cof3htzjwffvxd2lla7sn2ozynci436rdmah5vsvllsahmxz6qro.py"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0, "has_payload": "c06d796ae11c7e77048735efc71e26ca"} | |
# AOT ID: ['14_inference'] | |
from ctypes import c_void_p, c_long | |
import torch | |
import math | |
import random | |
import os | |
import tempfile | |
from math import inf, nan | |
from torch._inductor.hooks import run_intermediate_hooks | |
from torch._inductor.utils import maybe_profile | |
from torch._inductor.codegen.memory_planning import _align as align | |
from torch import device, empty_strided | |
from torch._inductor.async_compile import AsyncCompile | |
from torch._inductor.select_algorithm import extern_kernels | |
from torch._inductor.codegen.multi_kernel import MultiKernelCall | |
aten = torch.ops.aten | |
inductor_ops = torch.ops.inductor | |
_quantized = torch.ops._quantized | |
assert_size_stride = torch._C._dynamo.guards.assert_size_stride | |
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu | |
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda | |
alloc_from_pool = torch.ops.inductor._alloc_from_pool | |
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor | |
async_compile = AsyncCompile() | |
async_compile.wait(globals()) | |
del async_compile | |
def call(args): | |
arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1 = args | |
args.clear() | |
assert_size_stride(arg0_1, (13, 3), (3, 1)) | |
assert_size_stride(arg1_1, (13, 3), (3, 1)) | |
assert_size_stride(arg2_1, (13, 3), (3, 1)) | |
assert_size_stride(arg3_1, (13, 3), (3, 1)) | |
assert_size_stride(arg4_1, (13, 3), (3, 1)) | |
assert_size_stride(arg5_1, (13, 3), (3, 1)) | |
assert_size_stride(arg6_1, (13, 3), (3, 1)) | |
assert_size_stride(arg7_1, (13, 3), (3, 1)) | |
assert_size_stride(arg8_1, (13, 3), (3, 1)) | |
assert_size_stride(arg9_1, (13, 3), (3, 1)) | |
assert_size_stride(arg10_1, (13, 3), (3, 1)) | |
assert_size_stride(arg11_1, (13, 3), (3, 1)) | |
return (reinterpret_tensor(arg0_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg1_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg2_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg3_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg4_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg5_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg6_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg7_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg8_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg9_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg10_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg11_1, (11, 3), (3, 1), 3), ) | |
def benchmark_compiled_module(times=10, repeat=10): | |
from torch._dynamo.testing import rand_strided | |
from torch._inductor.utils import print_performance | |
arg0_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32) | |
arg1_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32) | |
arg2_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32) | |
arg3_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32) | |
arg4_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32) | |
arg5_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32) | |
arg6_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32) | |
arg7_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32) | |
arg8_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32) | |
arg9_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32) | |
arg10_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32) | |
arg11_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32) | |
fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1]) | |
return print_performance(fn, times=times, repeat=repeat) | |
if __name__ == "__main__": | |
from torch._inductor.wrapper_benchmark import compiled_module_main | |
compiled_module_main('hf_BigBird', benchmark_compiled_module) | |
V0627 17:31:10.810000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"} | |
[ | |
] | |
V0627 17:31:10.810000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0, "has_payload": "6e13f24b700fd79116617b1177bb6706"} | |
TREE_GUARD_MANAGER: | |
+- RootGuardManager | |
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards | |
| +- GLOBAL_STATE: ___check_global_state() | |
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self) | |
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202264976) | |
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor | |
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training) | |
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824) | |
| +- GuardManager: source=L['___stack0'], accessed_by=DictGetItemGuardAccessor(___stack0) | |
| | +- TYPE_MATCH: ___check_type_id(L['___stack0'], 7650400) | |
| | +- LENGTH_CHECK: len(L['___stack0']) == 12 | |
| | +- GuardManager: source=L['___stack0'][0], accessed_by=ListGetItemGuardAccessor(0) | |
| | | +- GuardManager: source=___from_numpy(L['___stack0'][0]), accessed_by=PythonLambdaGuardAccessor | |
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][0]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1]) | |
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][0]), '_dynamo_dynamic_indices') == False | |
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11])) | |
| | +- GuardManager: source=L['___stack0'][1], accessed_by=ListGetItemGuardAccessor(1) | |
| | | +- GuardManager: source=___from_numpy(L['___stack0'][1]), accessed_by=PythonLambdaGuardAccessor | |
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][1]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1]) | |
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][1]), '_dynamo_dynamic_indices') == False | |
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11])) | |
| | +- GuardManager: source=L['___stack0'][2], accessed_by=ListGetItemGuardAccessor(2) | |
| | | +- GuardManager: source=___from_numpy(L['___stack0'][2]), accessed_by=PythonLambdaGuardAccessor | |
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][2]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1]) | |
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][2]), '_dynamo_dynamic_indices') == False | |
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11])) | |
| | +- GuardManager: source=L['___stack0'][3], accessed_by=ListGetItemGuardAccessor(3) | |
| | | +- GuardManager: source=___from_numpy(L['___stack0'][3]), accessed_by=PythonLambdaGuardAccessor | |
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][3]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1]) | |
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][3]), '_dynamo_dynamic_indices') == False | |
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11])) | |
| | +- GuardManager: source=L['___stack0'][4], accessed_by=ListGetItemGuardAccessor(4) | |
| | | +- GuardManager: source=___from_numpy(L['___stack0'][4]), accessed_by=PythonLambdaGuardAccessor | |
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][4]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1]) | |
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][4]), '_dynamo_dynamic_indices') == False | |
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11])) | |
| | +- GuardManager: source=L['___stack0'][5], accessed_by=ListGetItemGuardAccessor(5) | |
| | | +- GuardManager: source=___from_numpy(L['___stack0'][5]), accessed_by=PythonLambdaGuardAccessor | |
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][5]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1]) | |
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][5]), '_dynamo_dynamic_indices') == False | |
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11])) | |
| | +- GuardManager: source=L['___stack0'][6], accessed_by=ListGetItemGuardAccessor(6) | |
| | | +- GuardManager: source=___from_numpy(L['___stack0'][6]), accessed_by=PythonLambdaGuardAccessor | |
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][6]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1]) | |
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][6]), '_dynamo_dynamic_indices') == False | |
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11])) | |
| | +- GuardManager: source=L['___stack0'][7], accessed_by=ListGetItemGuardAccessor(7) | |
| | | +- GuardManager: source=___from_numpy(L['___stack0'][7]), accessed_by=PythonLambdaGuardAccessor | |
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][7]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1]) | |
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][7]), '_dynamo_dynamic_indices') == False | |
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11])) | |
| | +- GuardManager: source=L['___stack0'][8], accessed_by=ListGetItemGuardAccessor(8) | |
| | | +- GuardManager: source=___from_numpy(L['___stack0'][8]), accessed_by=PythonLambdaGuardAccessor | |
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][8]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1]) | |
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][8]), '_dynamo_dynamic_indices') == False | |
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11])) | |
| | +- GuardManager: source=L['___stack0'][9], accessed_by=ListGetItemGuardAccessor(9) | |
| | | +- GuardManager: source=___from_numpy(L['___stack0'][9]), accessed_by=PythonLambdaGuardAccessor | |
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][9]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1]) | |
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][9]), '_dynamo_dynamic_indices') == False | |
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11])) | |
| | +- GuardManager: source=L['___stack0'][10], accessed_by=ListGetItemGuardAccessor(10) | |
| | | +- GuardManager: source=___from_numpy(L['___stack0'][10]), accessed_by=PythonLambdaGuardAccessor | |
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][10]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1]) | |
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][10]), '_dynamo_dynamic_indices') == False | |
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11])) | |
| | +- GuardManager: source=L['___stack0'][11], accessed_by=ListGetItemGuardAccessor(11) | |
| | | +- GuardManager: source=___from_numpy(L['___stack0'][11]), accessed_by=PythonLambdaGuardAccessor | |
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][11]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1]) | |
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][11]), '_dynamo_dynamic_indices') == False | |
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11])) | |
| +- GuardManager: source=L['num_heads'], accessed_by=DictGetItemGuardAccessor(num_heads) | |
| | +- EQUALS_MATCH: L['num_heads'] == 12 | |
| +- GuardManager: source=L['num_blocks'], accessed_by=DictGetItemGuardAccessor(num_blocks) | |
| | +- EQUALS_MATCH: L['num_blocks'] == 13 | |
| +- GuardManager: source=L['global_block_top'], accessed_by=DictGetItemGuardAccessor(global_block_top) | |
| | +- EQUALS_MATCH: L['global_block_top'] == 1 | |
| +- GuardManager: source=L['global_block_bottom'], accessed_by=DictGetItemGuardAccessor(global_block_bottom) | |
| | +- EQUALS_MATCH: L['global_block_bottom'] == 1 | |
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor | |
| | +- GuardManager: source=G['__builtins_dict___71'], accessed_by=DictGetItemGuardAccessor(__builtins_dict___71) | |
| | | +- GuardManager: source=G['__builtins_dict___71']['range'], accessed_by=DictGetItemGuardAccessor(range) | |
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___71']['range'], 7632448) | |
V0627 17:31:10.810000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "14/1", "frame_key": "29", "co_name": "torch_dynamo_resume_in__bigbird_block_rand_mask_with_head_at_1165", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 1165, "cache_size": 0, "accumulated_cache_size": 1, "guard_count": 25, "shape_env_guard_count": 0, "graph_op_count": 12, "graph_node_count": 25, "graph_input_count": 12, "start_time": 1719534670.6493185, "entire_frame_compile_time_s": 0.16145634651184082, "backend_compile_time_s": 0.12227082252502441, "inductor_compile_time_s": 0.022518634796142578, "code_gen_time_s": 0.0035479068756103516, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": [], "dynamo_time_before_restart_s": 0.0, "has_guarded_code": true}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.811000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 472, "name": "forward", "filename": 5}, {"line": 569, "name": "bigbird_block_sparse_attention", "filename": 5}, {"line": 583, "name": "torch_dynamo_resume_in_bigbird_block_sparse_attention_at_569", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.818000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.818000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ff64360>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.818000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 0, "source": "___from_numpy(L['___stack0'][0])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.819000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.819000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ff5e020>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.819000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 1, "source": "___from_numpy(L['___stack0'][1])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.820000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.820000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 2, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ff5f5b0>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.820000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 2, "source": "___from_numpy(L['___stack0'][2])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.821000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 3, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.821000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 3, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e300934c0>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.821000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 3, "source": "___from_numpy(L['___stack0'][3])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.822000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 4, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.822000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 4, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 4, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ff46cf0>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.822000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 4, "source": "___from_numpy(L['___stack0'][4])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.823000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 5, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.823000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 5, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 5, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ff44d60>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.823000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 5, "source": "___from_numpy(L['___stack0'][5])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.823000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 6, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.824000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 6, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 6, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ffb1a80>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.824000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 6, "source": "___from_numpy(L['___stack0'][6])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.824000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 7, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.825000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 7, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 7, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e300dbbf0>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.825000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 7, "source": "___from_numpy(L['___stack0'][7])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.825000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 8, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.825000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 8, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 8, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30090bd0>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.826000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 8, "source": "___from_numpy(L['___stack0'][8])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.826000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 9, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.826000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 9, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 9, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ff67920>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.826000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 9, "source": "___from_numpy(L['___stack0'][9])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.827000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 10, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.827000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 10, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 10, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ffc3010>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.827000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 10, "source": "___from_numpy(L['___stack0'][10])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.828000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 11, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.828000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 11, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 11, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ffc20c0>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.828000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 11, "source": "___from_numpy(L['___stack0'][11])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.830000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 12, "describer_id": 60, "size": 1277952}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.830000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 12, "ndim": 4, "dtype": "torch.bfloat16", "device": "device(type='cpu')", "size": [1, 12, 832, 64], "is_leaf": true, "stride": [638976, 64, 768, 1], "storage": 12, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30014c20>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.830000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 12, "source": "L['query_layer']"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.833000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 13, "describer_id": 60, "size": 3328}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.834000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 14, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 13, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.834000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 13, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 13, 64], "is_leaf": true, "is_view": true, "stride": [832, 64, 1], "storage": 13, "base": 14, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317fbec0>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.834000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 13, "source": "L['from_blocked_mask']"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.840000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 14, "describer_id": 60, "size": 1277952}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.840000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 15, "ndim": 4, "dtype": "torch.bfloat16", "device": "device(type='cpu')", "size": [1, 12, 832, 64], "is_leaf": true, "stride": [638976, 64, 768, 1], "storage": 14, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30014220>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.840000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 15, "source": "L['key_layer']"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.841000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 15, "describer_id": 60, "size": 1277952}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.842000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 16, "ndim": 4, "dtype": "torch.bfloat16", "device": "device(type='cpu')", "size": [1, 12, 832, 64], "is_leaf": true, "stride": [638976, 64, 768, 1], "storage": 15, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30015d50>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.842000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 16, "source": "L['value_layer']"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.856000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 17, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 13, "base": 14, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.856000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 17, "source": "L['to_mask']"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.893000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 16, "describer_id": 60, "size": 442368}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.893000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 18, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 16, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.893000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 18, "source": "L['band_mask']"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.936000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 19, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 13, "base": 14, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.936000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 19, "source": "L['from_mask']"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0} | |
V0627 17:31:10.952000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {"l_stack0_0_": [11, 3], "l_stack0_1_": [11, 3], "l_stack0_2_": [11, 3], "l_stack0_3_": [11, 3], "l_stack0_4_": [11, 3], "l_stack0_5_": [11, 3], "l_stack0_6_": [11, 3], "l_stack0_7_": [11, 3], "l_stack0_8_": [11, 3], "l_stack0_9_": [11, 3], "l_stack0_10_": [11, 3], "l_stack0_11_": [11, 3], "l_query_layer_": [1, 12, 832, 64], "l_from_blocked_mask_": [1, 13, 64], "l_key_layer_": [1, 12, 832, 64], "l_value_layer_": [1, 12, 832, 64], "l_to_mask_": [1, 1, 1, 832], "l_band_mask_": [1, 1, 9, 64, 192], "l_from_mask_": [1, 1, 832, 1], "rand_attn": [12, 11, 3], "rand_attn_1": [1, 12, 11, 3], "unsqueeze_": [1, 12, 11, 3], "rand_attn_2": [1, 12, 11, 3], "p1": [13, 64], "i1": [12, 11, 3], "flatten": [396], "getitem_2": [396, 64], "rand_mask": [1, 396, 64], "rand_mask_1": [1, 12, 11, 192], "getitem_3": [1, 11, 64], "rand_mask_2": [1, 12, 11, 64, 192], "blocked_query_matrix": [1, 12, 13, 64, 64], "blocked_key_matrix": [1, 12, 13, 64, 64], "blocked_value_matrix": [1, 12, 13, 64, 64], "shift": [396], "div": [396], "indices_shift": [396], "view_4": [396], "flattened_indices": [396], "flattened_params": [156, 64, 64], "out_flattened": [396, 64, 64], "out": [1, 12, 33, 64, 64], "gathered_key": [1, 12, 11, 192, 64], "shift_1": [396], "div_1": [396], "indices_shift_1": [396], "view_6": [396], "flattened_indices_1": [396], "flattened_params_1": [156, 64, 64], "out_flattened_1": [396, 64, 64], "out_1": [1, 12, 33, 64, 64], "gathered_value": [1, 12, 11, 192, 64], "getitem_4": [1, 12, 64, 64], "reshape_4": [12, 64, 64], "reshape_5": [12, 832, 64], "transpose": [12, 64, 832], "bmm": [12, 64, 832], "first_product": [1, 12, 64, 832], "first_product_1": [1, 12, 64, 832], "sub": [1, 1, 1, 832], "mul_3": [1, 1, 1, 832], "first_product_2": [1, 12, 64, 832], "first_attn_weights": [1, 12, 64, 832], "reshape_6": [12, 64, 832], "reshape_7": [12, 832, 64], "bmm_1": [12, 64, 64], 
"first_context_layer": [1, 12, 1, 64, 64], "unsqueeze__1": [1, 12, 1, 64, 64], "getitem_5": [1, 12, 64, 64], "getitem_6": [1, 12, 64, 64], "getitem_7": [1, 12, 64, 64], "getitem_8": [1, 12, 64, 64], "getitem_9": [1, 12, 192, 64], "second_key_mat": [1, 12, 448, 64], "getitem_10": [1, 12, 64, 64], "getitem_11": [1, 12, 64, 64], "getitem_12": [1, 12, 64, 64], "getitem_13": [1, 12, 64, 64], "getitem_14": [1, 12, 192, 64], "second_value_mat": [1, 12, 448, 64], "getitem_15": [1, 12, 64, 64], "reshape_8": [12, 64, 64], "reshape_9": [12, 448, 64], "transpose_1": [12, 64, 448], "bmm_2": [12, 64, 448], "second_product": [1, 12, 64, 448], "getitem_16": [1, 1, 1, 192], "getitem_17": [1, 1, 1, 64], "new_ones": [1, 1, 1, 192], "second_seq_pad": [1, 1, 1, 448], "new_ones_1": [1, 12, 64, 256], "getitem_18": [1, 12, 64, 192], "second_rand_pad": [1, 12, 64, 448], "second_product_1": [1, 12, 64, 448], "minimum": [1, 12, 64, 448], "sub_1": [1, 12, 64, 448], "mul_5": [1, 12, 64, 448], "second_product_2": [1, 12, 64, 448], "second_attn_weights": [1, 12, 64, 448], "reshape_10": [12, 64, 448], "reshape_11": [12, 448, 64], "bmm_3": [12, 64, 64], "second_context_layer": [1, 12, 1, 64, 64], "unsqueeze__2": [1, 12, 1, 64, 64], "getitem_19": [1, 12, 9, 64, 64], "getitem_20": [1, 12, 9, 64, 64], "getitem_21": [1, 12, 9, 64, 64], "exp_blocked_key_matrix": [1, 12, 9, 192, 64], "getitem_22": [1, 12, 9, 64, 64], "getitem_23": [1, 12, 9, 64, 64], "getitem_24": [1, 12, 9, 64, 64], "exp_blocked_value_matrix": [1, 12, 9, 192, 64], "middle_query_matrix": [1, 12, 9, 64, 64], "reshape_12": [108, 64, 64], "reshape_13": [108, 192, 64], "transpose_2": [108, 64, 192], "bmm_4": [108, 64, 192], "inner_band_product": [1, 12, 9, 64, 192], "inner_band_product_1": [1, 12, 9, 64, 192], "getitem_26": [1, 12, 9, 192, 64], "reshape_14": [108, 64, 64], "reshape_15": [108, 192, 64], "transpose_3": [108, 64, 192], "bmm_5": [108, 64, 192], "rand_band_product": [1, 12, 9, 64, 192], "rand_band_product_1": [1, 12, 9, 64, 
192], "getitem_27": [1, 12, 64, 64], "first_band_product": [1, 12, 9, 64, 64], "first_band_product_1": [1, 12, 9, 64, 64], "getitem_28": [1, 12, 64, 64], "last_band_product": [1, 12, 9, 64, 64], "last_band_product_1": [1, 12, 9, 64, 64], "sub_2": [1, 1, 9, 64, 192], "mul_10": [1, 1, 9, 64, 192], "inner_band_product_2": [1, 12, 9, 64, 192], "getitem_29": [1, 1, 1, 64], "unsqueeze": [1, 1, 1, 1, 64], "sub_3": [1, 1, 1, 1, 64], "mul_11": [1, 1, 1, 1, 64], "first_band_product_2": [1, 12, 9, 64, 64], "getitem_30": [1, 1, 1, 64], "unsqueeze_1": [1, 1, 1, 1, 64], "sub_4": [1, 1, 1, 1, 64], "mul_12": [1, 1, 1, 1, 64], "last_band_product_2": [1, 12, 9, 64, 64], "getitem_31": [1, 12, 9, 64, 192], "sub_5": [1, 12, 9, 64, 192], "mul_13": [1, 12, 9, 64, 192], "rand_band_product_2": [1, 12, 9, 64, 192], "band_product": [1, 12, 9, 64, 512], "attn_weights": [1, 12, 9, 64, 512], "getitem_32": [1, 12, 9, 64, 192], "reshape_16": [108, 64, 192], "reshape_17": [108, 192, 64], "bmm_6": [108, 64, 64], "context_layer": [1, 12, 9, 64, 64], "getitem_33": [1, 12, 9, 64, 192], "getitem_34": [1, 12, 9, 192, 64], "reshape_18": [108, 64, 192], "reshape_19": [108, 192, 64], "bmm_7": [108, 64, 64], "view_15": [1, 12, 9, 64, 64], "context_layer_1": [1, 12, 9, 64, 64], "getitem_35": [1, 12, 9, 64, 64], "getitem_36": [1, 12, 64, 64], "einsum_3": [1, 12, 9, 64, 64], "context_layer_2": [1, 12, 9, 64, 64], "getitem_37": [1, 12, 9, 64, 64], "getitem_38": [1, 12, 64, 64], "einsum_4": [1, 12, 9, 64, 64], "context_layer_3": [1, 12, 9, 64, 64], "getitem_39": [1, 12, 64, 64], "getitem_40": [1, 12, 64, 64], "getitem_41": [1, 12, 64, 64], "getitem_42": [1, 12, 64, 64], "getitem_43": [1, 12, 192, 64], "second_last_key_mat": [1, 12, 448, 64], "getitem_44": [1, 12, 64, 64], "getitem_45": [1, 12, 64, 64], "getitem_46": [1, 12, 64, 64], "getitem_47": [1, 12, 64, 64], "getitem_48": [1, 12, 192, 64], "second_last_value_mat": [1, 12, 448, 64], "getitem_49": [1, 12, 64, 64], "reshape_20": [12, 64, 64], "reshape_21": 
[12, 448, 64], "transpose_4": [12, 64, 448], "bmm_8": [12, 64, 448], "second_last_product": [1, 12, 64, 448], "getitem_50": [1, 1, 1, 64], "getitem_51": [1, 1, 1, 192], "new_ones_2": [1, 1, 1, 192], "second_last_seq_pad": [1, 1, 1, 448], "new_ones_3": [1, 12, 64, 256], "getitem_52": [1, 12, 64, 192], "second_last_rand_pad": [1, 12, 64, 448], "second_last_product_1": [1, 12, 64, 448], "minimum_1": [1, 12, 64, 448], "sub_6": [1, 12, 64, 448], "mul_15": [1, 12, 64, 448], "second_last_product_2": [1, 12, 64, 448], "second_last_attn_weights": [1, 12, 64, 448], "reshape_22": [12, 64, 448], "reshape_23": [12, 448, 64], "bmm_9": [12, 64, 64], "second_last_context_layer": [1, 12, 1, 64, 64], "unsqueeze__3": [1, 12, 1, 64, 64], "getitem_53": [1, 12, 64, 64], "reshape_24": [12, 64, 64], "reshape_25": [12, 832, 64], "transpose_5": [12, 64, 832], "bmm_10": [12, 64, 832], "last_product": [1, 12, 64, 832], "last_product_1": [1, 12, 64, 832], "sub_7": [1, 1, 1, 832], "mul_17": [1, 1, 1, 832], "last_product_2": [1, 12, 64, 832], "last_attn_weights": [1, 12, 64, 832], "reshape_26": [12, 64, 832], "reshape_27": [12, 832, 64], "bmm_11": [12, 64, 64], "last_context_layer": [1, 12, 1, 64, 64], "unsqueeze__4": [1, 12, 1, 64, 64], "context_layer_4": [1, 12, 13, 64, 64], "view_20": [1, 12, 832, 64], "context_layer_5": [1, 12, 832, 64], "context_layer_6": [1, 832, 12, 64]}}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0, "has_payload": "8c75f78cbe780240c3647a51d604b94a"} | |
class GraphModule(torch.nn.Module): | |
def forward(self, L_stack0_0_: "i32[11, 3][3, 1]cpu", L_stack0_1_: "i32[11, 3][3, 1]cpu", L_stack0_2_: "i32[11, 3][3, 1]cpu", L_stack0_3_: "i32[11, 3][3, 1]cpu", L_stack0_4_: "i32[11, 3][3, 1]cpu", L_stack0_5_: "i32[11, 3][3, 1]cpu", L_stack0_6_: "i32[11, 3][3, 1]cpu", L_stack0_7_: "i32[11, 3][3, 1]cpu", L_stack0_8_: "i32[11, 3][3, 1]cpu", L_stack0_9_: "i32[11, 3][3, 1]cpu", L_stack0_10_: "i32[11, 3][3, 1]cpu", L_stack0_11_: "i32[11, 3][3, 1]cpu", L_query_layer_: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", L_from_blocked_mask_: "f32[1, 13, 64][832, 64, 1]cpu", L_key_layer_: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", L_value_layer_: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", L_to_mask_: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu", L_band_mask_: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu", L_from_mask_: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu"): | |
l_stack0_0_ = L_stack0_0_ | |
l_stack0_1_ = L_stack0_1_ | |
l_stack0_2_ = L_stack0_2_ | |
l_stack0_3_ = L_stack0_3_ | |
l_stack0_4_ = L_stack0_4_ | |
l_stack0_5_ = L_stack0_5_ | |
l_stack0_6_ = L_stack0_6_ | |
l_stack0_7_ = L_stack0_7_ | |
l_stack0_8_ = L_stack0_8_ | |
l_stack0_9_ = L_stack0_9_ | |
l_stack0_10_ = L_stack0_10_ | |
l_stack0_11_ = L_stack0_11_ | |
l_query_layer_ = L_query_layer_ | |
l_from_blocked_mask_ = L_from_blocked_mask_ | |
l_key_layer_ = L_key_layer_ | |
l_value_layer_ = L_value_layer_ | |
l_to_mask_ = L_to_mask_ | |
l_band_mask_ = L_band_mask_ | |
l_from_mask_ = L_from_mask_ | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:593 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = np.stack(rand_attn, axis=0) | |
rand_attn: "i32[12, 11, 3][33, 3, 1]cpu" = torch__dynamo_utils_wrapped_stack([l_stack0_0_, l_stack0_1_, l_stack0_2_, l_stack0_3_, l_stack0_4_, l_stack0_5_, l_stack0_6_, l_stack0_7_, l_stack0_8_, l_stack0_9_, l_stack0_10_, l_stack0_11_], axis = 0); l_stack0_0_ = l_stack0_1_ = l_stack0_2_ = l_stack0_3_ = l_stack0_4_ = l_stack0_5_ = l_stack0_6_ = l_stack0_7_ = l_stack0_8_ = l_stack0_9_ = l_stack0_10_ = l_stack0_11_ = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:594 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = torch.tensor(rand_attn, device=query_layer.device, dtype=torch.long) | |
rand_attn_1: "i64[1, 12, 11, 3][396, 33, 3, 1]cpu" = torch.tensor(rand_attn, device = device(type='cpu'), dtype = torch.int64); rand_attn = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:595 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn.unsqueeze_(0) | |
unsqueeze_: "i64[1, 12, 11, 3][396, 33, 3, 1]cpu" = rand_attn_1.unsqueeze_(0) | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:596 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = torch.cat([rand_attn for _ in range(batch_size)], dim=0) | |
rand_attn_2: "i64[1, 12, 11, 3][396, 33, 3, 1]cpu" = torch.cat([rand_attn_1], dim = 0); rand_attn_1 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in _create_rand_mask_from_inputs, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)]) | |
p1: "f32[13, 64][64, 1]cpu" = l_from_blocked_mask_[0] | |
i1: "i64[12, 11, 3][33, 3, 1]cpu" = rand_attn_2[0] | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in <listcomp>, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)]) | |
flatten: "i64[396][1]cpu" = i1.flatten(); i1 = None | |
getitem_2: "f32[396, 64][64, 1]cpu" = p1[flatten]; p1 = flatten = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in _create_rand_mask_from_inputs, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)]) | |
rand_mask: "f32[1, 396, 64][25344, 64, 1]cpu" = torch.stack([getitem_2]); getitem_2 = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1016 in _create_rand_mask_from_inputs, code: rand_mask = rand_mask.view(batch_size, num_attention_heads, num_windows, num_rand_blocks * from_block_size) | |
rand_mask_1: "f32[1, 12, 11, 192][25344, 2112, 192, 1]cpu" = rand_mask.view(1, 12, 11, 192); rand_mask = None | |
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-pa |
View raw
(Sorry about that, but we can’t show files that are this big right now.)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment