@leslie-fang-intel
Created June 28, 2024 00:32
trace log for 128513
This file has been truncated, but you can view the full file.
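The structured entries below are what PyTorch emits when the TORCH_TRACE environment variable points at a log directory; tlparse can then render such a file into a browsable report. A minimal sketch of producing a trace like this one (the paths and the toy function are assumptions, not taken from this log):

import os
os.environ["TORCH_TRACE"] = "/tmp/trace_logs"  # set before torch initializes its logging

import torch

@torch.compile(backend="inductor")
def f(x):
    # any compiled function will do; each compiled frame produces entries like the ones below
    return torch.ones_like(x) + x

f(torch.randn(8))
# then, from a shell: tlparse /tmp/trace_logs/<generated log file>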
V0627 17:31:00.663000 139845268738432 torch/_logging/structured.py:19] {"str": ["/localdisk/leslie/torch_inductor_community/pytorch/benchmarks/dynamo/torchbench.py", 0]}
V0627 17:31:00.663000 139845268738432 torch/_logging/structured.py:19] {"str": ["/localdisk/leslie/torch_inductor_community/pytorch/benchmarks/dynamo/common.py", 1]}
V0627 17:31:00.663000 139845268738432 torch/_logging/structured.py:19] {"str": ["/localdisk/leslie/torch_inductor_community/pytorch/torch/_dynamo/eval_frame.py", 2]}
V0627 17:31:00.663000 139845268738432 torch/_logging/structured.py:19] {"str": ["/localdisk/leslie/torch_inductor_community/pytorch/torch/_dynamo/convert_frame.py", 3]}
V0627 17:31:00.663000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:00.691000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 0, "size": 6552}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:00.692000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 819], "is_leaf": true, "stride": [819, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2ef4724e50>", "describer_id": 0}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:00.692000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 0, "id": 0, "source": "L['inputs'][0]"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:00.701000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 0, "size": 32768}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:00.701000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 4096], "is_leaf": true, "stride": [4096, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2eb1d44450>", "describer_id": 0}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:00.701000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 0, "id": 1, "source": "L['mod'].bert.embeddings.token_type_ids"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:00.718000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {}}, "frame_id": 0, "frame_compile_id": 0, "attempt": 1, "has_payload": "8f3f91fb1d48d67b1336de49ea694c74"}
class GraphModule(torch.nn.Module):
    def forward(self):
        # No stacktrace found for following nodes
        _enter_autocast = torch.amp.autocast_mode._enter_autocast('cpu', None, True, None)
        _exit_autocast = torch.amp.autocast_mode._exit_autocast(_enter_autocast); _enter_autocast = None
        return ()
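This first graph is empty apart from the autocast enter/exit pair: frame 0 is the benchmark harness's forward_pass wrapper, which only opens a CPU autocast context around the model call (the model itself is compiled as later frames). Roughly, as a sketch of the harness shape inferred from the guards on L['self'].autocast below (names assumed, not copied from torchbench):

import functools
import torch

autocast = functools.partial(torch.autocast, device_type="cpu")  # matches the keywords guard

def forward_pass(mod, inputs):
    # only the context manager is captured in frame 0; mod(*inputs) compiles separately
    with autocast():
        return mod(*inputs)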
V0627 17:31:01.398000 139845268738432 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:199] {"aot_forward_graph": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 1, "has_payload": "845c30ca0008a08ec62276cecc47183b"}
class <lambda>(torch.nn.Module):
    def forward(self):
        return ()
V0627 17:31:01.498000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:01.498000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 1, "has_payload": "40c07a4da7b433b5416cc93985646719"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['mod'], accessed_by=DictGetItemGuardAccessor(mod)
| | +- ID_MATCH: ___check_obj_id(L['mod'], 139839714901824)
| | +- GuardManager: source=L['mod'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['mod'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['mod'].training, 7685824)
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- TYPE_MATCH: ___check_type_id(L['self'], 139842378438672)
| | +- GuardManager: source=L['self'].autocast, accessed_by=GetAttrGuardAccessor(autocast)
| | | +- TYPE_MATCH: ___check_type_id(L['self'].autocast, 139845255007760)
| | | +- GuardManager: source=L['self'].autocast.args, accessed_by=GetAttrGuardAccessor(args)
| | | | +- TYPE_MATCH: ___check_type_id(L['self'].autocast.args, 7625984)
| | | | +- LENGTH_CHECK: not L['self'].autocast.args
| | | +- GuardManager: source=L['self'].autocast.func, accessed_by=GetAttrGuardAccessor(func)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].autocast.func, 139844826956816)
| | | +- GuardManager: source=L['self'].autocast.keywords, accessed_by=GetAttrGuardAccessor(keywords)
| | | | +- TYPE_MATCH: ___check_type_id(L['self'].autocast.keywords, 7646656)
| | | | +- GuardManager: source=L['self'].autocast.keywords['device_type'], accessed_by=DictGetItemGuardAccessor(device_type)
| | | | | +- EQUALS_MATCH: L['self'].autocast.keywords['device_type'] == 'cpu'
| | +- GuardManager: source=L['self'].autocast_arg, accessed_by=GetAttrGuardAccessor(autocast_arg)
| | | +- TYPE_MATCH: ___check_type_id(L['self'].autocast_arg, 7646656)
| | | +- DICT_LENGTH: not L['self'].autocast_arg
| +- GuardManager: source=L['inputs'], accessed_by=DictGetItemGuardAccessor(inputs)
| | +- TYPE_MATCH: ___check_type_id(L['inputs'], 7625984)
| | +- LENGTH_CHECK: len(L['inputs']) == 1
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['__builtins_dict___1'], accessed_by=DictGetItemGuardAccessor(__builtins_dict___1)
| | | +- GuardManager: source=G['__builtins_dict___1']['dict'], accessed_by=DictGetItemGuardAccessor(dict)
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___1']['dict'], 7646656)
| | | +- GuardManager: source=G['__builtins_dict___1']['isinstance'], accessed_by=DictGetItemGuardAccessor(isinstance)
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___1']['isinstance'], 139845257826512)
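Each entry in the tree above is a cheap runtime check (ID_MATCH on object identity, TYPE_MATCH on the exact class, EQUALS_MATCH on a value, LENGTH_CHECK on container size) that must pass for this frame's cached compiled code to be reused; any failure triggers recompilation. A minimal sketch for dumping the same kind of guard tree for your own function, assuming the standard logging-artifact API:

import torch

torch._logging.set_logs(guards=True)  # equivalent to running with TORCH_LOGS="guards"

@torch.compile
def f(x):
    return x + 1

f(torch.randn(4))  # the guard tree prints when this frame compiles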
V0627 17:31:01.498000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "0/0", "frame_key": "1", "co_name": "forward_pass", "co_filename": "/localdisk/leslie/torch_inductor_community/pytorch/benchmarks/dynamo/torchbench.py", "co_firstlineno": 425, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 19, "shape_env_guard_count": 0, "graph_op_count": 2, "graph_node_count": 3, "graph_input_count": 0, "start_time": 1719534660.6636841, "entire_frame_compile_time_s": 0.8347411155700684, "backend_compile_time_s": 0.7748816013336182, "inductor_compile_time_s": 0.00018596649169921875, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Logger not supported for non-export cases"], "dynamo_time_before_restart_s": 0.04396843910217285, "has_guarded_code": true}, "frame_id": 0, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:01.500000 139845268738432 torch/_logging/structured.py:19] {"str": ["/localdisk/leslie/torch_inductor_community/pytorch/torch/nn/modules/module.py", 4]}
V0627 17:31:01.500000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:01.513000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 6, "size": 6552}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:01.513000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 819], "is_leaf": true, "stride": [819, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2ef4724e50>", "describer_id": 6}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:01.514000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 6, "id": 0, "source": "L['input_ids']"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:01.519000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 6, "size": 32768}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:01.519000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 4096], "is_leaf": true, "stride": [4096, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2eb1d44450>", "describer_id": 6}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:01.519000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 6, "id": 1, "source": "L['self'].bert.embeddings.token_type_ids"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:01.530000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 7, "size": 6552}, "frame_id": 1, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:01.530000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 819], "is_leaf": true, "stride": [819, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2ef4724e50>", "describer_id": 7}, "frame_id": 1, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:01.530000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 7, "id": 0, "source": "L['input_ids']"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:01.535000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:01.535000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 1, "has_payload": "6017f86a7c776c49ca1dd7d3539605bb"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839714901824)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| | | +- GuardManager: source=L['self']._modules, accessed_by=DictGetItemGuardAccessor(_modules)
| | | | +- GuardManager: source=L['self'].bert, accessed_by=DictGetItemGuardAccessor(bert)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].bert, 139839714901584)
| | | | | +- GuardManager: source=L['self'].bert.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- GuardManager: source=L['self'].bert.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].bert.training, 7685824)
| | | +- GuardManager: source=L['self'].config, accessed_by=DictGetItemGuardAccessor(config)
| | | | +- TYPE_MATCH: ___check_type_id(L['self'].config, 139839810624528)
| +- GuardManager: source=L['head_mask'], accessed_by=DictGetItemGuardAccessor(head_mask)
| | +- ID_MATCH: ___check_obj_id(L['head_mask'], 7636800)
| +- GuardManager: source=L['input_ids'], accessed_by=DictGetItemGuardAccessor(input_ids)
| | +- TENSOR_MATCH: check_tensor(L['input_ids'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int64, device=None, requires_grad=False, size=[1, 819], stride=[819, 1])
| | +- NO_HASATTR: hasattr(L['input_ids'], '_dynamo_dynamic_indices') == False
| +- GuardManager: source=L['return_dict'], accessed_by=DictGetItemGuardAccessor(return_dict)
| | +- ID_MATCH: ___check_obj_id(L['return_dict'], 7636800)
| +- GuardManager: source=L['position_ids'], accessed_by=DictGetItemGuardAccessor(position_ids)
| | +- ID_MATCH: ___check_obj_id(L['position_ids'], 7636800)
| +- GuardManager: source=L['inputs_embeds'], accessed_by=DictGetItemGuardAccessor(inputs_embeds)
| | +- ID_MATCH: ___check_obj_id(L['inputs_embeds'], 7636800)
| +- GuardManager: source=L['attention_mask'], accessed_by=DictGetItemGuardAccessor(attention_mask)
| | +- ID_MATCH: ___check_obj_id(L['attention_mask'], 7636800)
| +- GuardManager: source=L['token_type_ids'], accessed_by=DictGetItemGuardAccessor(token_type_ids)
| | +- ID_MATCH: ___check_obj_id(L['token_type_ids'], 7636800)
| +- GuardManager: source=L['output_attentions'], accessed_by=DictGetItemGuardAccessor(output_attentions)
| | +- ID_MATCH: ___check_obj_id(L['output_attentions'], 7636800)
| +- GuardManager: source=L['output_hidden_states'], accessed_by=DictGetItemGuardAccessor(output_hidden_states)
| | +- ID_MATCH: ___check_obj_id(L['output_hidden_states'], 7636800)
| +- GuardManager: source=L['encoder_hidden_states'], accessed_by=DictGetItemGuardAccessor(encoder_hidden_states)
| | +- ID_MATCH: ___check_obj_id(L['encoder_hidden_states'], 7636800)
| +- GuardManager: source=L['encoder_attention_mask'], accessed_by=DictGetItemGuardAccessor(encoder_attention_mask)
| | +- ID_MATCH: ___check_obj_id(L['encoder_attention_mask'], 7636800)
V0627 17:31:01.535000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "1/0", "frame_key": "6", "co_name": "forward", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 2382, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 19, "shape_env_guard_count": 0, "graph_op_count": 0, "graph_node_count": 1, "graph_input_count": 1, "start_time": 1719534661.5002189, "entire_frame_compile_time_s": 0.03560638427734375, "backend_compile_time_s": null, "inductor_compile_time_s": null, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Logger not supported for non-export cases"], "dynamo_time_before_restart_s": 0.022696733474731445, "has_guarded_code": true}, "frame_id": 1, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:01.536000 139845268738432 torch/_logging/structured.py:19] {"str": ["/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", 5]}
V0627 17:31:01.536000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 2, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:01.542000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 8, "size": 6552}, "frame_id": 2, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:01.542000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 819], "is_leaf": true, "stride": [819, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2ef4724e50>", "describer_id": 8}, "frame_id": 2, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:01.542000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 8, "id": 0, "source": "L['input_ids']"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:01.548000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 8, "size": 32768}, "frame_id": 2, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:01.548000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 4096], "is_leaf": true, "stride": [4096, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2eb1d44450>", "describer_id": 8}, "frame_id": 2, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:01.548000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 8, "id": 1, "source": "L['self'].embeddings.token_type_ids"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:01.557000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 9, "size": 6552}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:01.557000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 819], "is_leaf": true, "stride": [819, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2ef4724e50>", "describer_id": 9}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:01.557000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 9, "id": 0, "source": "L['input_ids']"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:01.562000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 9, "size": 32768}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:01.563000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 4096], "is_leaf": true, "stride": [4096, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2eb1d44450>", "describer_id": 9}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:01.563000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 9, "id": 1, "source": "L['self'].embeddings.token_type_ids"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:01.566000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {"attention_mask": [1, 819], "l__self___embeddings_token_type_ids": [1, 4096], "buffered_token_type_ids": [1, 819], "buffered_token_type_ids_expanded": [1, 819]}}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "6b7bec0701d22225fb67e6f1bfb9dc36"}
class GraphModule(torch.nn.Module):
    def forward(self):
        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2039 in forward, code: attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
        attention_mask: "f32[1, 819][819, 1]cpu" = torch.ones((1, 819), device = device(type='cpu'))

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2042 in forward, code: buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
        l__self___embeddings_token_type_ids: "i64[1, 4096][4096, 1]cpu" = self.L__self___embeddings_token_type_ids
        buffered_token_type_ids: "i64[1, 819][4096, 1]cpu" = l__self___embeddings_token_type_ids[(slice(None, None, None), slice(None, 819, None))]; l__self___embeddings_token_type_ids = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2043 in forward, code: buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
        buffered_token_type_ids_expanded: "i64[1, 819][4096, 1]cpu" = buffered_token_type_ids.expand(1, 819); buffered_token_type_ids = None
        return (attention_mask, buffered_token_type_ids_expanded)
V0627 17:31:01.581000 139845268738432 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:199] {"aot_forward_graph": {}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "1b3fb2899c356f991117f2262727f0ef"}
class <lambda>(torch.nn.Module):
    def forward(self, arg0_1: "i64[1, 4096][4096, 1]cpu"):
        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2039 in forward, code: attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
        full: "f32[1, 819][819, 1]cpu" = torch.ops.aten.full.default([1, 819], 1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False)

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2042 in forward, code: buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
        slice_1: "i64[1, 4096][4096, 1]cpu" = torch.ops.aten.slice.Tensor(arg0_1, 0, 0, 9223372036854775807); arg0_1 = None
        slice_2: "i64[1, 819][4096, 1]cpu" = torch.ops.aten.slice.Tensor(slice_1, 1, 0, 819); slice_1 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2043 in forward, code: buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
        expand: "i64[1, 819][4096, 1]cpu" = torch.ops.aten.expand.default(slice_2, [1, 819]); slice_2 = None
        return (full, expand)
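In the ATen graph above, the end index 9223372036854775807 is 2**63 - 1, the sentinel ATen uses for an open-ended Python slice, so the two slice ops together are just token_type_ids[:, :819]. A quick equivalence check (illustrative, not taken from the log):

import torch

t = torch.zeros(1, 4096, dtype=torch.int64)
a = t[:, :819]
# dim 0 sliced with the INT64_MAX sentinel, then dim 1 clipped to 819
b = torch.ops.aten.slice.Tensor(
    torch.ops.aten.slice.Tensor(t, 0, 0, 9223372036854775807), 1, 0, 819
)
assert torch.equal(a, b) and b.stride() == (4096, 1)  # stride matches the [4096, 1] annotation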
V0627 17:31:01.707000 139845268738432 torch/_inductor/compile_fx.py:754] {"inductor_post_grad_graph": {}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "4a69dc4d0dfb43287c6abf210e06617e"}
class <lambda>(torch.nn.Module):
    def forward(self, arg0_1: "i64[1, 4096][4096, 1]cpu"):
        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2039 in forward, code: attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
        full_default: "f32[1, 819][819, 1]cpu" = torch.ops.aten.full.default([1, 819], 1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False)

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2042 in forward, code: buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
        slice_2: "i64[1, 819][4096, 1]cpu" = torch.ops.aten.slice.Tensor(arg0_1, 1, 0, 819); arg0_1 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2043 in forward, code: buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
        expand: "i64[1, 819][4096, 1]cpu" = torch.ops.aten.expand.default(slice_2, [1, 819]); slice_2 = None
        return (full_default, expand)
V0627 17:31:02.787000 139845268738432 torch/_inductor/graph.py:1693] {"inductor_output_code": {"filename": "/tmp/torchinductor_leslie/ko/ckovbmwilyxv4sl5d74oe6jsmj25rub5gzv5kco7u66lavi64ivy.py"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "0244b4281966e5f52ba168279eb45118"}
# AOT ID: ['1_inference']
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
from torch._inductor.codegen.memory_planning import _align as align
from torch import device, empty_strided
from torch._inductor.async_compile import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
from torch._inductor.codegen.multi_kernel import MultiKernelCall
aten = torch.ops.aten
inductor_ops = torch.ops.inductor
_quantized = torch.ops._quantized
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
alloc_from_pool = torch.ops.inductor._alloc_from_pool
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
async_compile = AsyncCompile()
cpp_fused_ones_0 = async_compile.cpp_pybinding(['float*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(float* out_ptr0)
{
    {
        for(long x0=static_cast<long>(0L); x0<static_cast<long>(816L); x0+=static_cast<long>(16L))
        {
            auto tmp0 = static_cast<float>(1.0);
            auto tmp1 = at::vec::Vectorized<float>(tmp0);
            tmp1.store(out_ptr0 + static_cast<long>(x0));
        }
        #pragma omp simd simdlen(8)
        for(long x0=static_cast<long>(816L); x0<static_cast<long>(819L); x0+=static_cast<long>(1L))
        {
            auto tmp0 = static_cast<float>(1.0);
            out_ptr0[static_cast<long>(x0)] = tmp0;
        }
    }
}
''')

async_compile.wait(globals())
del async_compile

def call(args):
    arg0_1, = args
    args.clear()
    assert_size_stride(arg0_1, (1, 4096), (4096, 1))
    buf0 = empty_strided_cpu((1, 819), (819, 1), torch.float32)
    cpp_fused_ones_0(buf0)
    return (buf0, reinterpret_tensor(arg0_1, (1, 819), (4096, 1), 0), )

def benchmark_compiled_module(times=10, repeat=10):
    from torch._dynamo.testing import rand_strided
    from torch._inductor.utils import print_performance
    arg0_1 = rand_strided((1, 4096), (4096, 1), device='cpu', dtype=torch.int64)
    fn = lambda: call([arg0_1])
    return print_performance(fn, times=times, repeat=repeat)

if __name__ == "__main__":
    from torch._inductor.wrapper_benchmark import compiled_module_main
    compiled_module_main('hf_BigBird', benchmark_compiled_module)
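Generated wrappers like the one above are written to the cache path shown in the inductor_output_code entry and are self-contained: call() is the compiled entry point, and running the file directly benchmarks it via compiled_module_main. A sketch of loading the cached module by hand (the path is this log's cache file, so this only works on the original machine):

import importlib.util

spec = importlib.util.spec_from_file_location(
    "inductor_wrapper",
    "/tmp/torchinductor_leslie/ko/ckovbmwilyxv4sl5d74oe6jsmj25rub5gzv5kco7u66lavi64ivy.py",
)
wrapper = importlib.util.module_from_spec(spec)
spec.loader.exec_module(wrapper)          # compiles and binds the C++ kernel
wrapper.benchmark_compiled_module()       # times the compiled kernel in isolation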
V0627 17:31:02.814000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:02.815000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "0b1c2f71c2e67149726041714c77db6e"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839714901584)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| | | +- GuardManager: source=L['self'].config, accessed_by=DictGetItemGuardAccessor(config)
| | | | +- TYPE_MATCH: ___check_type_id(L['self'].config, 139839810624528)
| | | +- GuardManager: source=L['self']._modules, accessed_by=DictGetItemGuardAccessor(_modules)
| | | | +- GuardManager: source=L['self'].embeddings, accessed_by=DictGetItemGuardAccessor(embeddings)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings, 139839713378208)
| | | | | +- GuardManager: source=L['self'].embeddings.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- GuardManager: source=L['self'].embeddings.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.training, 7685824)
| | | | | | +- GuardManager: source=L['self'].embeddings._buffers, accessed_by=DictGetItemGuardAccessor(_buffers)
| | | | | | | +- GuardManager: source=L['self'].embeddings.token_type_ids, accessed_by=DictGetItemGuardAccessor(token_type_ids)
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.token_type_ids, 139838528701520)
| | | +- GuardManager: source=L['self'].attention_type, accessed_by=DictGetItemGuardAccessor(attention_type)
| | | | +- EQUALS_MATCH: L['self'].attention_type == 'block_sparse'
| +- GuardManager: source=L['input_ids'], accessed_by=DictGetItemGuardAccessor(input_ids)
| | +- TENSOR_MATCH: check_tensor(L['input_ids'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int64, device=None, requires_grad=False, size=[1, 819], stride=[819, 1])
| | +- NO_HASATTR: hasattr(L['input_ids'], '_dynamo_dynamic_indices') == False
| +- GuardManager: source=L['return_dict'], accessed_by=DictGetItemGuardAccessor(return_dict)
| | +- ID_MATCH: ___check_obj_id(L['return_dict'], 7685856)
| +- GuardManager: source=L['position_ids'], accessed_by=DictGetItemGuardAccessor(position_ids)
| | +- ID_MATCH: ___check_obj_id(L['position_ids'], 7636800)
| +- GuardManager: source=L['inputs_embeds'], accessed_by=DictGetItemGuardAccessor(inputs_embeds)
| | +- ID_MATCH: ___check_obj_id(L['inputs_embeds'], 7636800)
| +- GuardManager: source=L['attention_mask'], accessed_by=DictGetItemGuardAccessor(attention_mask)
| | +- ID_MATCH: ___check_obj_id(L['attention_mask'], 7636800)
| +- GuardManager: source=L['token_type_ids'], accessed_by=DictGetItemGuardAccessor(token_type_ids)
| | +- ID_MATCH: ___check_obj_id(L['token_type_ids'], 7636800)
| +- GuardManager: source=L['past_key_values'], accessed_by=DictGetItemGuardAccessor(past_key_values)
| | +- ID_MATCH: ___check_obj_id(L['past_key_values'], 7636800)
| +- GuardManager: source=L['output_attentions'], accessed_by=DictGetItemGuardAccessor(output_attentions)
| | +- ID_MATCH: ___check_obj_id(L['output_attentions'], 7636800)
| +- GuardManager: source=L['output_hidden_states'], accessed_by=DictGetItemGuardAccessor(output_hidden_states)
| | +- ID_MATCH: ___check_obj_id(L['output_hidden_states'], 7636800)
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['torch'], accessed_by=DictGetItemGuardAccessor(torch)
| | | +- ID_MATCH: ___check_obj_id(G['torch'], 139845236322800)
| | | +- GuardManager: source=G['torch'].ones, accessed_by=GetAttrGuardAccessor(ones)
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].ones, 139845228734288)
| | +- GuardManager: source=G['__import_torch'], accessed_by=DictGetItemGuardAccessor(__import_torch)
| | | +- GuardManager: source=G['__import_torch'].fx, accessed_by=GetAttrGuardAccessor(fx)
| | | | +- ID_MATCH: ___check_obj_id(G['__import_torch'].fx, 139842407409488)
| | | | +- GuardManager: source=G['__import_torch'].fx.Proxy, accessed_by=GetAttrGuardAccessor(Proxy)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch'].fx.Proxy, 139842429035536)
| | | +- GuardManager: source=G['__import_torch']._dynamo, accessed_by=GetAttrGuardAccessor(_dynamo)
| | | | +- ID_MATCH: ___check_obj_id(G['__import_torch']._dynamo, 139839776121264)
| | | | +- GuardManager: source=G['__import_torch']._dynamo.is_compiling, accessed_by=GetAttrGuardAccessor(is_compiling)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch']._dynamo.is_compiling, 139839726529856)
| | +- GuardManager: source=G['__builtins_dict___9'], accessed_by=DictGetItemGuardAccessor(__builtins_dict___9)
| | | +- GuardManager: source=G['__builtins_dict___9']['hasattr'], accessed_by=DictGetItemGuardAccessor(hasattr)
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___9']['hasattr'], 139845257826112)
| | | +- GuardManager: source=G['__builtins_dict___9']['isinstance'], accessed_by=DictGetItemGuardAccessor(isinstance)
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___9']['isinstance'], 139845257826512)
| | +- GuardManager: source=G['__import_transformers_dot_modeling_utils'], accessed_by=DictGetItemGuardAccessor(__import_transformers_dot_modeling_utils)
| | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_modeling_utils'], 139839661201088)
| | | +- GuardManager: source=G['__import_transformers_dot_modeling_utils'].torch, accessed_by=GetAttrGuardAccessor(torch)
| | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_modeling_utils'].torch, 139845236322800)
| | | | +- GuardManager: source=G['__import_transformers_dot_modeling_utils'].torch.jit, accessed_by=GetAttrGuardAccessor(jit)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_modeling_utils'].torch.jit, 139842414949968)
| | | | | +- GuardManager: source=G['__import_transformers_dot_modeling_utils'].torch.jit.is_tracing, accessed_by=GetAttrGuardAccessor(is_tracing)
| | | | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_modeling_utils'].torch.jit.is_tracing, 139842413687088)
| | | +- GuardManager: source=G['__import_transformers_dot_modeling_utils'].is_torch_fx_proxy, accessed_by=GetAttrGuardAccessor(is_torch_fx_proxy)
| | | | +- GuardManager: source=G['__import_transformers_dot_modeling_utils'].is_torch_fx_proxy.__code__, accessed_by=GetAttrGuardAccessor(__code__)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_modeling_utils'].is_torch_fx_proxy.__code__, 139839683265264)
| | | +- GuardManager: source=G['__import_transformers_dot_modeling_utils'].is_torchdynamo_compiling, accessed_by=GetAttrGuardAccessor(is_torchdynamo_compiling)
| | | | +- GuardManager: source=G['__import_transformers_dot_modeling_utils'].is_torchdynamo_compiling.__code__, accessed_by=GetAttrGuardAccessor(__code__)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_modeling_utils'].is_torchdynamo_compiling.__code__, 139839683236192)
| | +- GuardManager: source=G['__import_transformers_dot_utils_dot_import_utils'], accessed_by=DictGetItemGuardAccessor(__import_transformers_dot_utils_dot_import_utils)
| | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_utils_dot_import_utils'], 139839683217824)
| | | +- GuardManager: source=G['__import_transformers_dot_utils_dot_import_utils']._torch_available, accessed_by=GetAttrGuardAccessor(_torch_available)
| | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_utils_dot_import_utils']._torch_available, 7685856)
| | | +- GuardManager: source=G['__import_transformers_dot_utils_dot_import_utils'].is_torch_available, accessed_by=GetAttrGuardAccessor(is_torch_available)
| | | | +- GuardManager: source=G['__import_transformers_dot_utils_dot_import_utils'].is_torch_available.__code__, accessed_by=GetAttrGuardAccessor(__code__)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_utils_dot_import_utils'].is_torch_available.__code__, 139839683197424)
| | | +- GuardManager: source=G['__import_transformers_dot_utils_dot_import_utils']._torch_fx_available, accessed_by=GetAttrGuardAccessor(_torch_fx_available)
| | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_utils_dot_import_utils']._torch_fx_available, 7685856)
| | | +- GuardManager: source=G['__import_transformers_dot_utils_dot_import_utils'].is_torch_fx_available, accessed_by=GetAttrGuardAccessor(is_torch_fx_available)
| | | | +- GuardManager: source=G['__import_transformers_dot_utils_dot_import_utils'].is_torch_fx_available.__code__, accessed_by=GetAttrGuardAccessor(__code__)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_utils_dot_import_utils'].is_torch_fx_available.__code__, 139839683233376)
V0627 17:31:02.815000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "2/0", "frame_key": "7", "co_name": "forward", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 1970, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 39, "shape_env_guard_count": 0, "graph_op_count": 3, "graph_node_count": 5, "graph_input_count": 0, "start_time": 1719534661.536575, "entire_frame_compile_time_s": 1.2790420055389404, "backend_compile_time_s": 1.2300312519073486, "inductor_compile_time_s": 1.2066993713378906, "code_gen_time_s": 1.083174467086792, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Logger not supported for non-export cases"], "dynamo_time_before_restart_s": 0.014907121658325195, "has_guarded_code": true}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:02.816000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 3, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.823000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 12, "size": 6552}, "frame_id": 3, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.823000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 819], "is_leaf": true, "stride": [819, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2ef4724e50>", "describer_id": 12}, "frame_id": 3, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.823000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 12, "id": 0, "source": "L['input_ids']"}, "frame_id": 3, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.830000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 13, "size": 6552}, "frame_id": 3, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:02.830000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 819], "is_leaf": true, "stride": [819, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2ef4724e50>", "describer_id": 13}, "frame_id": 3, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:02.830000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 13, "id": 0, "source": "L['input_ids']"}, "frame_id": 3, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:02.837000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 3, "frame_compile_id": 0, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:02.837000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 3, "frame_compile_id": 0, "attempt": 1, "has_payload": "fce4fac5f9230c475246dd6dd52e1c05"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839714901584)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| | | +- GuardManager: source=L['self'].config, accessed_by=DictGetItemGuardAccessor(config)
| | | | +- TYPE_MATCH: ___check_type_id(L['self'].config, 139839810624528)
| +- GuardManager: source=L['input_ids'], accessed_by=DictGetItemGuardAccessor(input_ids)
| | +- TENSOR_MATCH: check_tensor(L['input_ids'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int64, device=None, requires_grad=False, size=[1, 819], stride=[819, 1])
| | +- NO_HASATTR: hasattr(L['input_ids'], '_dynamo_dynamic_indices') == False
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['logger'], accessed_by=DictGetItemGuardAccessor(logger)
| | | +- ID_MATCH: ___check_obj_id(G['logger'], 139839664782448)
V0627 17:31:02.837000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "3/0", "frame_key": "8", "co_name": "_pad_to_block_size", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 2208, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 9, "shape_env_guard_count": 0, "graph_op_count": 0, "graph_node_count": 1, "graph_input_count": 1, "start_time": 1719534662.816827, "entire_frame_compile_time_s": 0.0205228328704834, "backend_compile_time_s": null, "inductor_compile_time_s": null, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Logger not supported for non-export cases"], "dynamo_time_before_restart_s": 0.007956266403198242, "has_guarded_code": true}, "frame_id": 3, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:02.838000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2226, "name": "_pad_to_block_size", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.839000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 14, "size": 6552}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.839000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 819], "is_leaf": true, "stride": [819, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2ef4724e50>", "describer_id": 14}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.839000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 14, "id": 0, "source": "L['input_ids']"}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.843000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 14, "size": 3276}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.844000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 2, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 819], "is_leaf": true, "stride": [819, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31f99710>", "describer_id": 14}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.844000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 14, "id": 2, "source": "L['attention_mask']"}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.846000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 14, "size": 32768}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.846000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 5, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 4096], "is_leaf": true, "stride": [4096, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2eb1d44450>", "describer_id": 14}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.846000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 4, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 819], "is_leaf": true, "is_view": true, "stride": [4096, 1], "storage": 2, "base": 5, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2ed9f15fd0>", "describer_id": 14}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.846000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 14, "id": 4, "source": "L['token_type_ids']"}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.850000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {"l_input_ids_": [1, 819], "l_attention_mask_": [1, 819], "l_token_type_ids_": [1, 819], "input_ids": [1, 832], "attention_mask": [1, 832], "token_type_ids": [1, 832]}}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "e7d86ff372082e962b35557ebd7308fc"}
class GraphModule(torch.nn.Module):
    def forward(self, L_input_ids_: "i64[1, 819][819, 1]cpu", L_attention_mask_: "f32[1, 819][819, 1]cpu", L_token_type_ids_: "i64[1, 819][4096, 1]cpu"):
        l_input_ids_ = L_input_ids_
        l_attention_mask_ = L_attention_mask_
        l_token_type_ids_ = L_token_type_ids_

        # File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/functional.py:4552 in pad, code: return torch._C._nn.pad(input, pad, mode, value)
        input_ids: "i64[1, 832][832, 1]cpu" = torch._C._nn.pad(l_input_ids_, (0, 13), 'constant', 0); l_input_ids_ = None

        # File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/functional.py:4552 in pad, code: return torch._C._nn.pad(input, pad, mode, value)
        attention_mask: "f32[1, 832][832, 1]cpu" = torch._C._nn.pad(l_attention_mask_, (0, 13), 'constant', False); l_attention_mask_ = None

        # File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/functional.py:4552 in pad, code: return torch._C._nn.pad(input, pad, mode, value)
        token_type_ids: "i64[1, 832][832, 1]cpu" = torch._C._nn.pad(l_token_type_ids_, (0, 13), 'constant', 0); l_token_type_ids_ = None
        return (input_ids, attention_mask, token_type_ids)
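The pad of 13 elements in each op above comes from BigBird's block-sparse attention (note the EQUALS_MATCH guard on L['padding_len'] == 13 further down), which needs the sequence length to be a multiple of the block size; 819 rounds up to 832. A worked check, assuming the Hugging Face BigBird default block_size of 64:

seq_len, block_size = 819, 64  # block_size is an assumption from the BigBird config default
padding_len = (block_size - seq_len % block_size) % block_size
assert padding_len == 13 and seq_len + padding_len == 832  # 832 = 13 * 64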
V0627 17:31:02.865000 139845268738432 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:199] {"aot_forward_graph": {}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "c1bca16543cf877223d6615932016518"}
class <lambda>(torch.nn.Module):
    def forward(self, arg0_1: "i64[1, 819][819, 1]cpu", arg1_1: "f32[1, 819][819, 1]cpu", arg2_1: "i64[1, 819][4096, 1]cpu"):
        # File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/functional.py:4552 in pad, code: return torch._C._nn.pad(input, pad, mode, value)
        constant_pad_nd: "i64[1, 832][832, 1]cpu" = torch.ops.aten.constant_pad_nd.default(arg0_1, [0, 13], 0.0); arg0_1 = None

        # File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/functional.py:4552 in pad, code: return torch._C._nn.pad(input, pad, mode, value)
        constant_pad_nd_1: "f32[1, 832][832, 1]cpu" = torch.ops.aten.constant_pad_nd.default(arg1_1, [0, 13], 0.0); arg1_1 = None

        # File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/functional.py:4552 in pad, code: return torch._C._nn.pad(input, pad, mode, value)
        constant_pad_nd_2: "i64[1, 832][832, 1]cpu" = torch.ops.aten.constant_pad_nd.default(arg2_1, [0, 13], 0.0); arg2_1 = None
        return (constant_pad_nd, constant_pad_nd_1, constant_pad_nd_2)
V0627 17:31:02.875000 139845268738432 torch/_inductor/compile_fx.py:754] {"inductor_post_grad_graph": {}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "c1bca16543cf877223d6615932016518"}
class <lambda>(torch.nn.Module):
    def forward(self, arg0_1: "i64[1, 819][819, 1]cpu", arg1_1: "f32[1, 819][819, 1]cpu", arg2_1: "i64[1, 819][4096, 1]cpu"):
        # File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/functional.py:4552 in pad, code: return torch._C._nn.pad(input, pad, mode, value)
        constant_pad_nd: "i64[1, 832][832, 1]cpu" = torch.ops.aten.constant_pad_nd.default(arg0_1, [0, 13], 0.0); arg0_1 = None

        # File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/functional.py:4552 in pad, code: return torch._C._nn.pad(input, pad, mode, value)
        constant_pad_nd_1: "f32[1, 832][832, 1]cpu" = torch.ops.aten.constant_pad_nd.default(arg1_1, [0, 13], 0.0); arg1_1 = None

        # File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/functional.py:4552 in pad, code: return torch._C._nn.pad(input, pad, mode, value)
        constant_pad_nd_2: "i64[1, 832][832, 1]cpu" = torch.ops.aten.constant_pad_nd.default(arg2_1, [0, 13], 0.0); arg2_1 = None
        return (constant_pad_nd, constant_pad_nd_1, constant_pad_nd_2)
V0627 17:31:02.904000 139845268738432 torch/_inductor/graph.py:1693] {"inductor_output_code": {"filename": "/tmp/torchinductor_leslie/fj/cfjbx53t7urpjz2ng6piwvoj4vsjiqa2xrgjcrh4vovq4w45eywv.py"}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "3758e3875a0e606fcec57aeffa852874"}
# AOT ID: ['2_inference']
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
from torch._inductor.codegen.memory_planning import _align as align
from torch import device, empty_strided
from torch._inductor.async_compile import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
from torch._inductor.codegen.multi_kernel import MultiKernelCall
aten = torch.ops.aten
inductor_ops = torch.ops.inductor
_quantized = torch.ops._quantized
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
alloc_from_pool = torch.ops.inductor._alloc_from_pool
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
async_compile = AsyncCompile()
cpp_fused_constant_pad_nd_0 = async_compile.cpp_pybinding(['const int64_t*', 'const float*', 'const int64_t*', 'int64_t*', 'float*', 'int64_t*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const int64_t* in_ptr0,
                       const float* in_ptr1,
                       const int64_t* in_ptr2,
                       int64_t* out_ptr0,
                       float* out_ptr1,
                       int64_t* out_ptr2)
{
    {
        for(long x0=static_cast<long>(0L); x0<static_cast<long>(832L); x0+=static_cast<long>(16L))
        {
            auto tmp0 = x0;
            auto tmp1 = c10::convert<int32_t>(tmp0);
            auto tmp2 = at::vec::Vectorized<int32_t>::arange(tmp1, 1);
            auto tmp3 = static_cast<int32_t>(819);
            auto tmp4 = at::vec::Vectorized<int32_t>(tmp3);
            auto tmp5 = at::vec::VecMask<int32_t,1>(tmp2 < tmp4);
            auto tmp6 = [&]
            {
                auto tmp7 = tmp5.template cast<float,1>().template loadu<int64_t,2>(in_ptr0 + static_cast<long>(x0));
                return tmp7;
            }
            ;
            auto tmp10 =
            [&]
            {
                if (tmp5.all_zero())
                {
                    return at::vec::VectorizedN<int64_t,2>(static_cast<int64_t>(0));
                }
                else
                {
                    auto tmp8 = tmp6();
                    auto tmp9 = at::vec::VectorizedN<int64_t,2>(static_cast<int64_t>(0));
                    return decltype(tmp8)::blendv(tmp9, tmp8, tmp5.template cast<int64_t,2>());
                }
            }
            ()
            ;
            tmp10.store(out_ptr0 + static_cast<long>(x0), 16);
        }
    }
    {
        for(long x0=static_cast<long>(0L); x0<static_cast<long>(832L); x0+=static_cast<long>(16L))
        {
            auto tmp0 = x0;
            auto tmp1 = c10::convert<int32_t>(tmp0);
            auto tmp2 = at::vec::Vectorized<int32_t>::arange(tmp1, 1);
            auto tmp3 = static_cast<int32_t>(819);
            auto tmp4 = at::vec::Vectorized<int32_t>(tmp3);
            auto tmp5 = at::vec::VecMask<int32_t,1>(tmp2 < tmp4);
            auto tmp6 = [&]
            {
                auto tmp7 = tmp5.template cast<float,1>().template loadu<float,1>(in_ptr1 + static_cast<long>(x0));
                return tmp7;
            }
            ;
            auto tmp10 =
            [&]
            {
                if (tmp5.all_zero())
                {
                    return at::vec::Vectorized<float>(static_cast<float>(0.0));
                }
                else
                {
                    auto tmp8 = tmp6();
                    auto tmp9 = at::vec::Vectorized<float>(static_cast<float>(0.0));
                    return decltype(tmp8)::blendv(tmp9, tmp8, tmp5.template cast<float,1>());
                }
            }
            ()
            ;
            tmp10.store(out_ptr1 + static_cast<long>(x0));
        }
    }
    {
        for(long x0=static_cast<long>(0L); x0<static_cast<long>(832L); x0+=static_cast<long>(16L))
        {
            auto tmp0 = x0;
            auto tmp1 = c10::convert<int32_t>(tmp0);
            auto tmp2 = at::vec::Vectorized<int32_t>::arange(tmp1, 1);
            auto tmp3 = static_cast<int32_t>(819);
            auto tmp4 = at::vec::Vectorized<int32_t>(tmp3);
            auto tmp5 = at::vec::VecMask<int32_t,1>(tmp2 < tmp4);
            auto tmp6 = [&]
            {
                auto tmp7 = tmp5.template cast<float,1>().template loadu<int64_t,2>(in_ptr2 + static_cast<long>(x0));
                return tmp7;
            }
            ;
            auto tmp10 =
            [&]
            {
                if (tmp5.all_zero())
                {
                    return at::vec::VectorizedN<int64_t,2>(static_cast<int64_t>(0));
                }
                else
                {
                    auto tmp8 = tmp6();
                    auto tmp9 = at::vec::VectorizedN<int64_t,2>(static_cast<int64_t>(0));
                    return decltype(tmp8)::blendv(tmp9, tmp8, tmp5.template cast<int64_t,2>());
                }
            }
            ()
            ;
            tmp10.store(out_ptr2 + static_cast<long>(x0), 16);
        }
    }
}
''')
async_compile.wait(globals())
del async_compile
def call(args):
arg0_1, arg1_1, arg2_1 = args
args.clear()
assert_size_stride(arg0_1, (1, 819), (819, 1))
assert_size_stride(arg1_1, (1, 819), (819, 1))
assert_size_stride(arg2_1, (1, 819), (4096, 1))
buf0 = empty_strided_cpu((1, 832), (832, 1), torch.int64)
buf1 = empty_strided_cpu((1, 832), (832, 1), torch.float32)
buf2 = empty_strided_cpu((1, 832), (832, 1), torch.int64)
cpp_fused_constant_pad_nd_0(arg0_1, arg1_1, arg2_1, buf0, buf1, buf2)
del arg0_1
del arg1_1
del arg2_1
return (buf0, buf1, buf2, )
def benchmark_compiled_module(times=10, repeat=10):
from torch._dynamo.testing import rand_strided
from torch._inductor.utils import print_performance
arg0_1 = rand_strided((1, 819), (819, 1), device='cpu', dtype=torch.int64)
arg1_1 = rand_strided((1, 819), (819, 1), device='cpu', dtype=torch.float32)
arg2_1 = rand_strided((1, 819), (4096, 1), device='cpu', dtype=torch.int64)
fn = lambda: call([arg0_1, arg1_1, arg2_1])
return print_performance(fn, times=times, repeat=repeat)
if __name__ == "__main__":
from torch._inductor.wrapper_benchmark import compiled_module_main
compiled_module_main('hf_BigBird', benchmark_compiled_module)
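Note: for readers cross-checking the kernel above, here is a minimal eager-mode sketch of what cpp_fused_constant_pad_nd_0 computes, under the guards recorded for this frame (padding_len == 13, pad_token_id == 0). Each (1, 819) input is zero-padded on the right to (1, 832), the next multiple of the 64-token block size; the vectorized mask tmp2 < 819 in the kernel enforces the same boundary. This is an illustration, not the generated code.

import torch
import torch.nn.functional as F

def pad_to_block_size_sketch(input_ids, attention_mask, token_type_ids,
                             padding_len=13, pad_token_id=0):
    # constant_pad_nd on the last dim: (1, 819) -> (1, 832); 832 = 13 * 64
    input_ids = F.pad(input_ids, (0, padding_len), value=pad_token_id)
    attention_mask = F.pad(attention_mask, (0, padding_len), value=0.0)  # padded positions masked out
    token_type_ids = F.pad(token_type_ids, (0, padding_len), value=0)
    return input_ids, attention_mask, token_type_ids

Since pad_token_id is guarded to 0 here, all three pads write zeros, which is why the kernel can use a single masked-load-or-zero pattern for every output.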
V0627 17:31:02.910000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:02.911000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "ae71e2a61c1f7e9b1434b71d14d096e3"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['input_ids'], accessed_by=DictGetItemGuardAccessor(input_ids)
| | +- TENSOR_MATCH: check_tensor(L['input_ids'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int64, device=None, requires_grad=False, size=[1, 819], stride=[819, 1])
| | +- NO_HASATTR: hasattr(L['input_ids'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['input_ids'], L['attention_mask'], L['token_type_ids'])
| +- GuardManager: source=L['padding_len'], accessed_by=DictGetItemGuardAccessor(padding_len)
| | +- EQUALS_MATCH: L['padding_len'] == 13
| +- GuardManager: source=L['pad_token_id'], accessed_by=DictGetItemGuardAccessor(pad_token_id)
| | +- EQUALS_MATCH: L['pad_token_id'] == 0
| +- GuardManager: source=L['position_ids'], accessed_by=DictGetItemGuardAccessor(position_ids)
| | +- ID_MATCH: ___check_obj_id(L['position_ids'], 7636800)
| +- GuardManager: source=L['inputs_embeds'], accessed_by=DictGetItemGuardAccessor(inputs_embeds)
| | +- ID_MATCH: ___check_obj_id(L['inputs_embeds'], 7636800)
| +- GuardManager: source=L['attention_mask'], accessed_by=DictGetItemGuardAccessor(attention_mask)
| | +- TENSOR_MATCH: check_tensor(L['attention_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 819], stride=[819, 1])
| | +- NO_HASATTR: hasattr(L['attention_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['input_ids'], L['attention_mask'], L['token_type_ids'])
| +- GuardManager: source=L['token_type_ids'], accessed_by=DictGetItemGuardAccessor(token_type_ids)
| | +- TENSOR_MATCH: check_tensor(L['token_type_ids'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int64, device=None, requires_grad=False, size=[1, 819], stride=[4096, 1])
| | +- NO_HASATTR: hasattr(L['token_type_ids'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['input_ids'], L['attention_mask'], L['token_type_ids'])
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['nn'], accessed_by=DictGetItemGuardAccessor(nn)
| | | +- ID_MATCH: ___check_obj_id(G['nn'], 139842442593680)
| | | +- GuardManager: source=G['nn'].functional, accessed_by=GetAttrGuardAccessor(functional)
| | | | +- ID_MATCH: ___check_obj_id(G['nn'].functional, 139842441627024)
| | | | +- GuardManager: source=G['nn'].functional.pad, accessed_by=GetAttrGuardAccessor(pad)
| | | | | +- GuardManager: source=G['nn'].functional.pad.__code__, accessed_by=GetAttrGuardAccessor(__code__)
| | | | | | +- ID_MATCH: ___check_obj_id(G['nn'].functional.pad.__code__, 139842439629440)
| | | | | +- GuardManager: source=G['nn'].functional.pad, accessed_by=FuncDefaultsGuardAccessor
| | | | | | +- GuardManager: source=G['nn'].functional.pad.__defaults__[0], accessed_by=GetItemGuardAccessor(0)
| | | | | | | +- EQUALS_MATCH: G['nn'].functional.pad.__defaults__[0] == 'constant'
| | +- GuardManager: source=G['__import_torch_dot_nn_dot_functional'], accessed_by=DictGetItemGuardAccessor(__import_torch_dot_nn_dot_functional)
| | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_functional'], 139842441627024)
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_functional'].torch, accessed_by=GetAttrGuardAccessor(torch)
| | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_functional'].torch, 139845236322800)
| | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_functional'].torch._C, accessed_by=GetAttrGuardAccessor(_C)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_functional'].torch._C, 139845228547104)
| | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_functional'].torch._C._nn, accessed_by=GetAttrGuardAccessor(_nn)
| | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_functional'].torch._C._nn, 139842445377216)
| | | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_functional'].torch._C._nn.pad, accessed_by=GetAttrGuardAccessor(pad)
| | | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_functional'].torch._C._nn.pad, 139842445416928)
| | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_functional'].torch.jit, accessed_by=GetAttrGuardAccessor(jit)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_functional'].torch.jit, 139842414949968)
| | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_functional'].torch.jit.is_scripting, accessed_by=GetAttrGuardAccessor(is_scripting)
| | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_functional'].torch.jit.is_scripting, 139842422983696)
| | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_functional'].torch.are_deterministic_algorithms_enabled, accessed_by=GetAttrGuardAccessor(are_deterministic_algorithms_enabled)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_functional'].torch.are_deterministic_algorithms_enabled, 139842451619504)
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_functional'].has_torch_function_unary, accessed_by=GetAttrGuardAccessor(has_torch_function_unary)
| | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_functional'].has_torch_function_unary, 139845228559104)
V0627 17:31:02.911000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "4/0", "frame_key": "9", "co_name": "torch_dynamo_resume_in__pad_to_block_size_at_2226", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 2226, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 26, "shape_env_guard_count": 0, "graph_op_count": 3, "graph_node_count": 7, "graph_input_count": 3, "start_time": 1719534662.838091, "entire_frame_compile_time_s": 0.07323813438415527, "backend_compile_time_s": 0.05719876289367676, "inductor_compile_time_s": 0.03380870819091797, "code_gen_time_s": 0.027545690536499023, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": [], "dynamo_time_before_restart_s": 0.0, "has_guarded_code": true}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.912000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.914000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 16, "size": 6656}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.914000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31b97f10>", "describer_id": 16}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.914000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 16, "id": 0, "source": "L['___stack0'][1]"}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.915000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 16, "size": 3328}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.915000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 16}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.915000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 16, "id": 1, "source": "L['___stack0'][2]"}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.915000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 16, "size": 6656}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.916000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 2, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2ea81b39c0>", "describer_id": 16}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.916000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 16, "id": 2, "source": "L['___stack0'][3]"}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.936000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 3, "describer_id": 16, "size": 32768}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.936000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 16, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 4096], "is_leaf": true, "stride": [4096, 1], "storage": 3, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2eb1d44270>", "describer_id": 16}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.936000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 16, "id": 16, "source": "L['self'].embeddings.position_ids"}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.997000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 17, "size": 6656}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:02.998000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31b97f10>", "describer_id": 17}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:02.998000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 17, "id": 0, "source": "L['___stack0'][1]"}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:02.998000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 17, "size": 3328}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:02.998000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 17}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:02.998000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 17, "id": 1, "source": "L['___stack0'][2]"}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:02.999000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 17, "size": 6656}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:02.999000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 2, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2ea81b39c0>", "describer_id": 17}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:02.999000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 17, "id": 2, "source": "L['___stack0'][3]"}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.014000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 3, "describer_id": 17, "size": 32768}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.014000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 4096], "is_leaf": true, "stride": [4096, 1], "storage": 3, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2eb1d44270>", "describer_id": 17}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.014000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 17, "id": 3, "source": "L['self'].embeddings.position_ids"}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.027000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {"l_stack0_1_": [1, 832], "l_stack0_2_": [1, 832], "l_stack0_3_": [1, 832], "blocked_encoder_mask": [1, 13, 64], "getitem": [1, 9, 64], "getitem_1": [1, 9, 64], "getitem_2": [1, 9, 64], "exp_blocked_to_pad": [1, 9, 192], "getitem_3": [1, 9, 64], "band_mask": [1, 1, 9, 64, 192], "unsqueeze_": [1, 1, 9, 64, 192], "from_mask": [1, 1, 832, 1], "to_mask": [1, 1, 1, 832], "l__self___embeddings_position_ids": [1, 4096], "position_ids": [1, 832], "inputs_embeds": [1, 832, 768], "token_type_embeddings": [1, 832, 768], "embeddings": [1, 832, 768], "position_embeddings": [1, 832, 768], "embeddings_1": [1, 832, 768], "embeddings_2": [1, 832, 768], "embeddings_3": [1, 832, 768]}}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1, "has_payload": "5bf8fff16cea4127a0a6b6a6800ef31a"}
class GraphModule(torch.nn.Module):
def forward(self, L_stack0_1_: "i64[1, 832][832, 1]cpu", L_stack0_2_: "f32[1, 832][832, 1]cpu", L_stack0_3_: "i64[1, 832][832, 1]cpu"):
l_stack0_1_ = L_stack0_1_
l_stack0_2_ = L_stack0_2_
l_stack0_3_ = L_stack0_3_
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2200 in create_masks_for_block_sparse_attn, code: blocked_encoder_mask = attention_mask.view(batch_size, seq_length // block_size, block_size)
blocked_encoder_mask: "f32[1, 13, 64][832, 64, 1]cpu" = l_stack0_2_.view(1, 13, 64)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2194 in create_band_mask_from_inputs, code: [to_blocked_mask[:, 1:-3], to_blocked_mask[:, 2:-2], to_blocked_mask[:, 3:-1]], dim=2
getitem: "f32[1, 9, 64][832, 64, 1]cpu" = blocked_encoder_mask[(slice(None, None, None), slice(1, -3, None))]
getitem_1: "f32[1, 9, 64][832, 64, 1]cpu" = blocked_encoder_mask[(slice(None, None, None), slice(2, -2, None))]
getitem_2: "f32[1, 9, 64][832, 64, 1]cpu" = blocked_encoder_mask[(slice(None, None, None), slice(3, -1, None))]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2193 in create_band_mask_from_inputs, code: exp_blocked_to_pad = torch.cat(
exp_blocked_to_pad: "f32[1, 9, 192][1728, 192, 1]cpu" = torch.cat([getitem, getitem_1, getitem_2], dim = 2); getitem = getitem_1 = getitem_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2196 in create_band_mask_from_inputs, code: band_mask = torch.einsum("blq,blk->blqk", from_blocked_mask[:, 2:-2], exp_blocked_to_pad)
getitem_3: "f32[1, 9, 64][832, 64, 1]cpu" = blocked_encoder_mask[(slice(None, None, None), slice(2, -2, None))]
band_mask: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = torch.functional.einsum('blq,blk->blqk', getitem_3, exp_blocked_to_pad); getitem_3 = exp_blocked_to_pad = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2197 in create_band_mask_from_inputs, code: band_mask.unsqueeze_(1)
unsqueeze_: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = band_mask.unsqueeze_(1)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2203 in create_masks_for_block_sparse_attn, code: from_mask = attention_mask.view(batch_size, 1, seq_length, 1)
from_mask: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu" = l_stack0_2_.view(1, 1, 832, 1)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2204 in create_masks_for_block_sparse_attn, code: to_mask = attention_mask.view(batch_size, 1, 1, seq_length)
to_mask: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = l_stack0_2_.view(1, 1, 1, 832); l_stack0_2_ = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:282 in forward, code: position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
l__self___embeddings_position_ids: "i64[1, 4096][4096, 1]cpu" = self.L__self___embeddings_position_ids
position_ids: "i64[1, 832][4096, 1]cpu" = l__self___embeddings_position_ids[(slice(None, None, None), slice(0, 832, None))]; l__self___embeddings_position_ids = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:296 in forward, code: inputs_embeds = self.word_embeddings(input_ids)
inputs_embeds: "f32[1, 832, 768][638976, 768, 1]cpu" = self.L__self___embeddings_word_embeddings(l_stack0_1_); l_stack0_1_ = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:301 in forward, code: token_type_embeddings = self.token_type_embeddings(token_type_ids)
token_type_embeddings: "f32[1, 832, 768][638976, 768, 1]cpu" = self.L__self___embeddings_token_type_embeddings(l_stack0_3_); l_stack0_3_ = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:303 in forward, code: embeddings = inputs_embeds + token_type_embeddings
embeddings: "f32[1, 832, 768][638976, 768, 1]cpu" = inputs_embeds + token_type_embeddings; inputs_embeds = token_type_embeddings = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:305 in forward, code: position_embeddings = self.position_embeddings(position_ids)
position_embeddings: "f32[1, 832, 768][638976, 768, 1]cpu" = self.L__self___embeddings_position_embeddings(position_ids); position_ids = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:306 in forward, code: embeddings += position_embeddings
embeddings += position_embeddings; embeddings_1: "f32[1, 832, 768][638976, 768, 1]cpu" = embeddings; embeddings = position_embeddings = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:308 in forward, code: embeddings = self.dropout(embeddings)
embeddings_2: "f32[1, 832, 768][638976, 768, 1]cpu" = self.L__self___embeddings_dropout(embeddings_1); embeddings_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:309 in forward, code: embeddings = self.LayerNorm(embeddings)
embeddings_3: "f32[1, 832, 768][638976, 768, 1]cpu" = self.L__self___embeddings_LayerNorm(embeddings_2); embeddings_2 = None
return (embeddings_3, band_mask, from_mask, to_mask, blocked_encoder_mask)
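Note: the graph above is BigBird's block-sparse mask construction. A standalone sketch with the recorded shapes (illustrative values, not the traced tensors):

import torch

attention_mask = torch.ones(1, 832)                      # stand-in for l_stack0_2_
blocked = attention_mask.view(1, 13, 64)                 # blocked_encoder_mask
exp_blocked_to_pad = torch.cat(
    [blocked[:, 1:-3], blocked[:, 2:-2], blocked[:, 3:-1]], dim=2)  # (1, 9, 192)
band_mask = torch.einsum("blq,blk->blqk", blocked[:, 2:-2], exp_blocked_to_pad)
band_mask.unsqueeze_(1)                                  # (1, 1, 9, 64, 192), matches unsqueeze_
from_mask = attention_mask.view(1, 1, 832, 1)
to_mask = attention_mask.view(1, 1, 1, 832)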
V0627 17:31:03.091000 139845268738432 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:199] {"aot_forward_graph": {}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1, "has_payload": "f7b6ff7875cdbc7ff1ea7b5f6bc39ed2"}
class <lambda>(torch.nn.Module):
def forward(self, arg0_1: "f32[50358, 768][768, 1]cpu", arg1_1: "f32[2, 768][768, 1]cpu", arg2_1: "f32[4096, 768][768, 1]cpu", arg3_1: "f32[768][1]cpu", arg4_1: "f32[768][1]cpu", arg5_1: "i64[1, 4096][4096, 1]cpu", arg6_1: "i64[1, 832][832, 1]cpu", arg7_1: "f32[1, 832][832, 1]cpu", arg8_1: "i64[1, 832][832, 1]cpu"):
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2200 in create_masks_for_block_sparse_attn, code: blocked_encoder_mask = attention_mask.view(batch_size, seq_length // block_size, block_size)
view: "f32[1, 13, 64][832, 64, 1]cpu" = torch.ops.aten.view.default(arg7_1, [1, 13, 64])
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2194 in create_band_mask_from_inputs, code: [to_blocked_mask[:, 1:-3], to_blocked_mask[:, 2:-2], to_blocked_mask[:, 3:-1]], dim=2
slice_1: "f32[1, 13, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view, 0, 0, 9223372036854775807)
slice_2: "f32[1, 9, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_1, 1, 1, -3); slice_1 = None
slice_3: "f32[1, 13, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view, 0, 0, 9223372036854775807)
slice_4: "f32[1, 9, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_3, 1, 2, -2); slice_3 = None
slice_5: "f32[1, 13, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view, 0, 0, 9223372036854775807)
slice_6: "f32[1, 9, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_5, 1, 3, -1); slice_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2193 in create_band_mask_from_inputs, code: exp_blocked_to_pad = torch.cat(
cat: "f32[1, 9, 192][1728, 192, 1]cpu" = torch.ops.aten.cat.default([slice_2, slice_4, slice_6], 2); slice_2 = slice_4 = slice_6 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2196 in create_band_mask_from_inputs, code: band_mask = torch.einsum("blq,blk->blqk", from_blocked_mask[:, 2:-2], exp_blocked_to_pad)
slice_7: "f32[1, 13, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view, 0, 0, 9223372036854775807)
slice_8: "f32[1, 9, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_7, 1, 2, -2); slice_7 = None
unsqueeze: "f32[1, 9, 64, 1][832, 64, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_8, 3); slice_8 = None
permute: "f32[1, 9, 64, 1][832, 64, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze, [0, 1, 2, 3]); unsqueeze = None
unsqueeze_1: "f32[1, 9, 192, 1][1728, 192, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(cat, 3); cat = None
permute_1: "f32[1, 9, 1, 192][1728, 192, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_1, [0, 1, 3, 2]); unsqueeze_1 = None
mul: "f32[1, 9, 64, 192][110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(permute, permute_1); permute = permute_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2197 in create_band_mask_from_inputs, code: band_mask.unsqueeze_(1)
unsqueeze_2: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = torch.ops.aten.unsqueeze.default(mul, 1); mul = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2203 in create_masks_for_block_sparse_attn, code: from_mask = attention_mask.view(batch_size, 1, seq_length, 1)
view_1: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu" = torch.ops.aten.view.default(arg7_1, [1, 1, 832, 1])
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2204 in create_masks_for_block_sparse_attn, code: to_mask = attention_mask.view(batch_size, 1, 1, seq_length)
view_2: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.view.default(arg7_1, [1, 1, 1, 832]); arg7_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:282 in forward, code: position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
slice_9: "i64[1, 4096][4096, 1]cpu" = torch.ops.aten.slice.Tensor(arg5_1, 0, 0, 9223372036854775807); arg5_1 = None
slice_10: "i64[1, 832][4096, 1]cpu" = torch.ops.aten.slice.Tensor(slice_9, 1, 0, 832); slice_9 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:296 in forward, code: inputs_embeds = self.word_embeddings(input_ids)
embedding: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.embedding.default(arg0_1, arg6_1, 0); arg0_1 = arg6_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:301 in forward, code: token_type_embeddings = self.token_type_embeddings(token_type_ids)
embedding_1: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.embedding.default(arg1_1, arg8_1); arg1_1 = arg8_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:303 in forward, code: embeddings = inputs_embeds + token_type_embeddings
add: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(embedding, embedding_1); embedding = embedding_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:305 in forward, code: position_embeddings = self.position_embeddings(position_ids)
embedding_2: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.embedding.default(arg2_1, slice_10); arg2_1 = slice_10 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:306 in forward, code: embeddings += position_embeddings
add_1: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(add, embedding_2); add = embedding_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:308 in forward, code: embeddings = self.dropout(embeddings)
clone: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.clone.default(add_1); add_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:309 in forward, code: embeddings = self.LayerNorm(embeddings)
var_mean = torch.ops.aten.var_mean.correction(clone, [2], correction = 0, keepdim = True)
getitem: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[0]
getitem_1: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[1]; var_mean = None
add_2: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.add.Tensor(getitem, 1e-12); getitem = None
rsqrt: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.rsqrt.default(add_2); add_2 = None
sub: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.sub.Tensor(clone, getitem_1); clone = getitem_1 = None
mul_1: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(sub, rsqrt); sub = rsqrt = None
mul_2: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(mul_1, arg3_1); mul_1 = arg3_1 = None
add_3: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(mul_2, arg4_1); mul_2 = arg4_1 = None
return (add_3, unsqueeze_2, view_1, view_2, view)
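Note: AOTAutograd lowered the einsum to unsqueeze/permute/mul rather than a matmul because "blq,blk->blqk" contracts nothing; every output index appears on the left, so it is a pure broadcast product. A quick equivalence check of the decomposition recorded above (shapes from this trace, random values):

import torch

a = torch.randn(1, 9, 64)    # from_blocked_mask[:, 2:-2]
b = torch.randn(1, 9, 192)   # exp_blocked_to_pad
ref = torch.einsum("blq,blk->blqk", a, b)
# unsqueeze to (1, 9, 64, 1) and (1, 9, 1, 192), then broadcast-multiply;
# the graph's permute([0, 1, 2, 3]) on the first operand is an identity and is omitted here.
dec = a.unsqueeze(3) * b.unsqueeze(3).permute(0, 1, 3, 2)
assert torch.allclose(ref, dec)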
V0627 17:31:03.161000 139845268738432 torch/_inductor/compile_fx.py:754] {"inductor_post_grad_graph": {}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1, "has_payload": "06e74f82fcfa9d791dc26355727799db"}
class <lambda>(torch.nn.Module):
def forward(self, arg6_1: "i64[1, 832][832, 1]cpu", arg7_1: "f32[1, 832][832, 1]cpu", arg8_1: "i64[1, 832][832, 1]cpu"):
# No stacktrace found for following nodes
_frozen_param0: "f32[50358, 768][768, 1]cpu" = self._frozen_param0
_frozen_param1: "f32[2, 768][768, 1]cpu" = self._frozen_param1
_frozen_param3: "f32[768][1]cpu" = self._frozen_param3
_frozen_param4: "f32[768][1]cpu" = self._frozen_param4
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:305 in forward, code: position_embeddings = self.position_embeddings(position_ids)
_frozen_param6: "f32[1, 832, 768][638976, 768, 1]cpu" = self._frozen_param6
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:296 in forward, code: inputs_embeds = self.word_embeddings(input_ids)
embedding: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.embedding.default(_frozen_param0, arg6_1, 0); _frozen_param0 = arg6_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:301 in forward, code: token_type_embeddings = self.token_type_embeddings(token_type_ids)
embedding_1: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.embedding.default(_frozen_param1, arg8_1); _frozen_param1 = arg8_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:303 in forward, code: embeddings = inputs_embeds + token_type_embeddings
add: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(embedding, embedding_1); embedding = embedding_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:306 in forward, code: embeddings += position_embeddings
add_1: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(add, _frozen_param6); add = _frozen_param6 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:309 in forward, code: embeddings = self.LayerNorm(embeddings)
var_mean = torch.ops.aten.var_mean.correction(add_1, [2], correction = 0, keepdim = True)
getitem: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[0]
getitem_1: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[1]; var_mean = None
sub: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.sub.Tensor(add_1, getitem_1); add_1 = getitem_1 = None
add_2: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.add.Tensor(getitem, 1e-12); getitem = None
rsqrt: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.rsqrt.default(add_2); add_2 = None
mul_1: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(sub, rsqrt); sub = rsqrt = None
mul_2: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(mul_1, _frozen_param3); mul_1 = _frozen_param3 = None
add_3: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(mul_2, _frozen_param4); mul_2 = _frozen_param4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2200 in create_masks_for_block_sparse_attn, code: blocked_encoder_mask = attention_mask.view(batch_size, seq_length // block_size, block_size)
view: "f32[1, 13, 64][832, 64, 1]cpu" = torch.ops.aten.reshape.default(arg7_1, [1, 13, 64])
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2194 in create_band_mask_from_inputs, code: [to_blocked_mask[:, 1:-3], to_blocked_mask[:, 2:-2], to_blocked_mask[:, 3:-1]], dim=2
slice_4: "f32[1, 9, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view, 1, 2, -2)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2196 in create_band_mask_from_inputs, code: band_mask = torch.einsum("blq,blk->blqk", from_blocked_mask[:, 2:-2], exp_blocked_to_pad)
unsqueeze: "f32[1, 9, 64, 1][832, 64, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_4, 3)
permute: "f32[1, 9, 64, 1][832, 64, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze, [0, 1, 2, 3]); unsqueeze = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2194 in create_band_mask_from_inputs, code: [to_blocked_mask[:, 1:-3], to_blocked_mask[:, 2:-2], to_blocked_mask[:, 3:-1]], dim=2
slice_2: "f32[1, 9, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view, 1, 1, -3)
slice_6: "f32[1, 9, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view, 1, 3, -1)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2193 in create_band_mask_from_inputs, code: exp_blocked_to_pad = torch.cat(
cat: "f32[1, 9, 192][1728, 192, 1]cpu" = torch.ops.aten.cat.default([slice_2, slice_4, slice_6], 2); slice_2 = slice_4 = slice_6 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2196 in create_band_mask_from_inputs, code: band_mask = torch.einsum("blq,blk->blqk", from_blocked_mask[:, 2:-2], exp_blocked_to_pad)
unsqueeze_1: "f32[1, 9, 192, 1][1728, 192, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(cat, 3); cat = None
permute_1: "f32[1, 9, 1, 192][1728, 192, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_1, [0, 1, 3, 2]); unsqueeze_1 = None
mul: "f32[1, 9, 64, 192][110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(permute, permute_1); permute = permute_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2197 in create_band_mask_from_inputs, code: band_mask.unsqueeze_(1)
unsqueeze_2: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = torch.ops.aten.unsqueeze.default(mul, 1); mul = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2203 in create_masks_for_block_sparse_attn, code: from_mask = attention_mask.view(batch_size, 1, seq_length, 1)
view_1: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu" = torch.ops.aten.reshape.default(arg7_1, [1, 1, 832, 1])
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2204 in create_masks_for_block_sparse_attn, code: to_mask = attention_mask.view(batch_size, 1, 1, seq_length)
view_2: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.reshape.default(arg7_1, [1, 1, 1, 832]); arg7_1 = None
return (add_3, unsqueeze_2, view_1, view_2, view)
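Note: this is the post-grad graph after Inductor's freezing pass. The weights were lifted out as _frozen_param0/1/3/4, and the position-embedding lookup disappeared entirely: position_ids is a registered buffer, so slice_10 plus its embedding were constant-folded into the precomputed (1, 832, 768) tensor _frozen_param6. A sketch of what that folded constant amounts to, assuming the buffer is the usual arange that BigBirdEmbeddings registers (hypothetical stand-in weights):

import torch
import torch.nn.functional as F

position_embedding_weight = torch.randn(4096, 768)   # stand-in for the real parameter
position_ids = torch.arange(4096).unsqueeze(0)       # assumed arange buffer, shape (1, 4096)
frozen_param6 = F.embedding(position_ids[:, :832], position_embedding_weight)
# frozen_param6: (1, 832, 768), baked in so no embedding lookup runs in call()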
V0627 17:31:03.519000 139845268738432 torch/_inductor/graph.py:1693] {"inductor_output_code": {"filename": "/tmp/torchinductor_leslie/de/cdewao76edq6vrvflsagsrjktsdjwfpzvsaaft6tyecuomopfso3.py"}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1, "has_payload": "f30f2b373864eaff49baf96db8ab8cb7"}
# AOT ID: ['3_inference']
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
from torch._inductor.codegen.memory_planning import _align as align
from torch import device, empty_strided
from torch._inductor.async_compile import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
from torch._inductor.codegen.multi_kernel import MultiKernelCall
aten = torch.ops.aten
inductor_ops = torch.ops.inductor
_quantized = torch.ops._quantized
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
alloc_from_pool = torch.ops.inductor._alloc_from_pool
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
async_compile = AsyncCompile()
_frozen_param0 = None # device(type='cpu') torch.float32 (50358, 768) (768, 1) 7f2eb1d44630
_frozen_param1 = None # device(type='cpu') torch.float32 (2, 768) (768, 1) 7f2eb1d445e0
_frozen_param3 = None # device(type='cpu') torch.float32 (768,) (1,) 7f2eb1d44540
_frozen_param4 = None # device(type='cpu') torch.float32 (768,) (1,) 7f2eb1d44810
_frozen_param6 = None # device(type='cpu') torch.float32 (1, 832, 768) (638976, 768, 1) 7f2e3165ccc0
cpp_fused_add_cat_embedding_mul_native_layer_norm_0 = async_compile.cpp_pybinding(['const int64_t*', 'const float*', 'const int64_t*', 'const float*', 'const float*', 'const float*', 'const float*', 'const float*', 'const float*', 'float*', 'float*', 'float*', 'float*', 'float*', 'float*', 'float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const int64_t* in_ptr0,
const float* in_ptr1,
const int64_t* in_ptr2,
const float* in_ptr3,
const float* in_ptr4,
const float* in_ptr5,
const float* in_ptr6,
const float* in_ptr7,
const float* in_ptr8,
float* out_ptr0,
float* out_ptr1,
float* out_ptr2,
float* out_ptr3,
float* out_ptr4,
float* out_ptr5,
float* out_ptr6,
float* out_ptr7)
{
#pragma omp parallel num_threads(56)
{
int tid = omp_get_thread_num();
{
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(832L); x0+=static_cast<long>(1L))
{
{
Welford<float> tmp_acc0 = Welford<float>();
Welford<at::vec::Vectorized<float>> tmp_acc0_vec = Welford<at::vec::Vectorized<float>>();
static WeightRecp<at::vec::Vectorized<float>> weight_recps(static_cast<long>(48L));
for(long x1=static_cast<long>(0L); x1<static_cast<long>(768L); x1+=static_cast<long>(16L))
{
auto tmp0 = in_ptr0[static_cast<long>(x0)];
auto tmp10 = in_ptr2[static_cast<long>(x0)];
auto tmp21 = at::vec::Vectorized<float>::loadu(in_ptr4 + static_cast<long>(x1 + (768L*x0)), 16);
auto tmp1 = 50358L;
auto tmp2 = c10::convert<int64_t>(tmp1);
auto tmp3 = decltype(tmp0)(tmp0 + tmp2);
auto tmp4 = tmp0 < 0;
auto tmp5 = tmp4 ? tmp3 : tmp0;
auto tmp6 = tmp5;
auto tmp7 = c10::convert<int64_t>(tmp6);
TORCH_CHECK((0 <= tmp7) & (tmp7 < 50358L), "index out of bounds: 0 <= tmp7 < 50358L");
auto tmp9 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(x1 + (768L*tmp5)), 16);
auto tmp11 = 2L;
auto tmp12 = c10::convert<int64_t>(tmp11);
auto tmp13 = decltype(tmp10)(tmp10 + tmp12);
auto tmp14 = tmp10 < 0;
auto tmp15 = tmp14 ? tmp13 : tmp10;
auto tmp16 = tmp15;
auto tmp17 = c10::convert<int64_t>(tmp16);
TORCH_CHECK((0 <= tmp17) & (tmp17 < 2L), "index out of bounds: 0 <= tmp17 < 2L");
auto tmp19 = at::vec::Vectorized<float>::loadu(in_ptr3 + static_cast<long>(x1 + (768L*tmp15)), 16);
auto tmp20 = tmp9 + tmp19;
auto tmp22 = tmp20 + tmp21;
tmp22.store(out_ptr0 + static_cast<long>(x1 + (768L*x0)));
tmp_acc0_vec = welford_combine(tmp_acc0_vec, tmp22, &weight_recps);
}
tmp_acc0 = welford_combine(tmp_acc0, welford_vec_reduce_all(tmp_acc0_vec));
out_ptr1[static_cast<long>(x0)] = static_cast<float>(tmp_acc0.mean);
out_ptr2[static_cast<long>(x0)] = static_cast<float>(tmp_acc0.m2);
}
for(long x1=static_cast<long>(0L); x1<static_cast<long>(768L); x1+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr0 + static_cast<long>(x1 + (768L*x0)), 16);
auto tmp1 = out_ptr1[static_cast<long>(x0)];
auto tmp4 = out_ptr2[static_cast<long>(x0)];
auto tmp12 = at::vec::Vectorized<float>::loadu(in_ptr5 + static_cast<long>(x1), 16);
auto tmp14 = at::vec::Vectorized<float>::loadu(in_ptr6 + static_cast<long>(x1), 16);
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 - tmp2;
auto tmp5 = static_cast<float>(768.0);
auto tmp6 = tmp4 / tmp5;
auto tmp7 = static_cast<float>(1e-12);
auto tmp8 = decltype(tmp6)(tmp6 + tmp7);
auto tmp9 = 1 / std::sqrt(tmp8);
auto tmp10 = at::vec::Vectorized<float>(tmp9);
auto tmp11 = tmp3 * tmp10;
auto tmp13 = tmp11 * tmp12;
auto tmp15 = tmp13 + tmp14;
tmp15.store(out_ptr3 + static_cast<long>(x1 + (768L*x0)));
}
}
}
#pragma omp single
{
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(9L); x0+=static_cast<long>(1L))
{
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr7 + static_cast<long>(64L + x1 + (64L*x0)), 16);
tmp0.store(out_ptr4 + static_cast<long>(x1 + (192L*x0)));
}
}
}
}
#pragma omp single
{
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(9L); x0+=static_cast<long>(1L))
{
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr7 + static_cast<long>(128L + x1 + (64L*x0)), 16);
tmp0.store(out_ptr5 + static_cast<long>(x1 + (192L*x0)));
}
}
}
}
#pragma omp single
{
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(9L); x0+=static_cast<long>(1L))
{
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr7 + static_cast<long>(192L + x1 + (64L*x0)), 16);
tmp0.store(out_ptr6 + static_cast<long>(x1 + (192L*x0)));
}
}
}
}
#pragma omp single
{
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(9L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L))
{
for(long x2=static_cast<long>(0L); x2<static_cast<long>(192L); x2+=static_cast<long>(16L))
{
auto tmp0 = in_ptr7[static_cast<long>(128L + x1 + (64L*x0))];
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr8 + static_cast<long>(x2 + (192L*x0)), 16);
auto tmp2 = at::vec::Vectorized<float>(tmp0);
auto tmp3 = tmp2 * tmp1;
tmp3.store(out_ptr7 + static_cast<long>(x2 + (192L*x1) + (12288L*x0)));
}
}
}
}
}
}
}
''')
async_compile.wait(globals())
del async_compile
def call(args):
arg6_1, arg7_1, arg8_1 = args
args.clear()
assert_size_stride(arg6_1, (1, 832), (832, 1))
assert_size_stride(arg7_1, (1, 832), (832, 1))
assert_size_stride(arg8_1, (1, 832), (832, 1))
buf0 = empty_strided_cpu((1, 832, 768), (638976, 768, 1), torch.float32)
buf1 = empty_strided_cpu((1, 832, 1), (832, 1, 832), torch.float32)
buf2 = empty_strided_cpu((1, 832, 1), (832, 1, 832), torch.float32)
buf4 = empty_strided_cpu((1, 832, 768), (638976, 768, 1), torch.float32)
buf8 = empty_strided_cpu((1, 9, 192), (1728, 192, 1), torch.float32)
buf5 = reinterpret_tensor(buf8, (1, 9, 64), (1728, 192, 1), 0) # alias
buf6 = reinterpret_tensor(buf8, (1, 9, 64), (1728, 192, 1), 64) # alias
buf7 = reinterpret_tensor(buf8, (1, 9, 64), (1728, 192, 1), 128) # alias
buf9 = empty_strided_cpu((1, 9, 64, 192), (110592, 12288, 192, 1), torch.float32)
cpp_fused_add_cat_embedding_mul_native_layer_norm_0(arg6_1, _frozen_param0, arg8_1, _frozen_param1, _frozen_param6, _frozen_param3, _frozen_param4, arg7_1, buf8, buf0, buf1, buf2, buf4, buf5, buf6, buf7, buf9)
del arg6_1
del arg8_1
return (buf4, reinterpret_tensor(buf9, (1, 1, 9, 64, 192), (110592, 110592, 12288, 192, 1), 0), reinterpret_tensor(arg7_1, (1, 1, 832, 1), (832, 832, 1, 1), 0), reinterpret_tensor(arg7_1, (1, 1, 1, 832), (832, 832, 832, 1), 0), reinterpret_tensor(arg7_1, (1, 13, 64), (832, 64, 1), 0), )
def benchmark_compiled_module(times=10, repeat=10):
from torch._dynamo.testing import rand_strided
from torch._inductor.utils import print_performance
global _frozen_param0
_frozen_param0 = rand_strided((50358, 768), (768, 1), device='cpu', dtype=torch.float32)
global _frozen_param1
_frozen_param1 = rand_strided((2, 768), (768, 1), device='cpu', dtype=torch.float32)
global _frozen_param3
_frozen_param3 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param4
_frozen_param4 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param6
_frozen_param6 = rand_strided((1, 832, 768), (638976, 768, 1), device='cpu', dtype=torch.float32)
arg6_1 = rand_strided((1, 832), (832, 1), device='cpu', dtype=torch.int64)
arg7_1 = rand_strided((1, 832), (832, 1), device='cpu', dtype=torch.float32)
arg8_1 = rand_strided((1, 832), (832, 1), device='cpu', dtype=torch.int64)
fn = lambda: call([arg6_1, arg7_1, arg8_1])
return print_performance(fn, times=times, repeat=repeat)
if __name__ == "__main__":
from torch._inductor.wrapper_benchmark import compiled_module_main
compiled_module_main('hf_BigBird', benchmark_compiled_module)
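Note: the per-row mean/variance for LayerNorm in the kernel above is accumulated with Welford's algorithm (the Welford<T>/welford_combine helpers come from the shared cskh5dx... header). A sketch of the merge step it relies on, checked against var_mean with correction=0; this mirrors the idea, not the header's exact code:

import torch

def welford_combine(n_a, mean_a, m2_a, n_b, mean_b, m2_b):
    # parallel merge of two (count, mean, sum-of-squared-deviations) partials
    n = n_a + n_b
    delta = mean_b - mean_a
    mean = mean_a + delta * n_b / n
    m2 = m2_a + m2_b + delta * delta * n_a * n_b / n
    return n, mean, m2

x = torch.randn(768)
n, mean, m2 = 0, 0.0, 0.0
for chunk in x.split(16):                      # mimic the 16-lane vectorized accumulation
    cn = chunk.numel()
    n, mean, m2 = welford_combine(n, mean, m2, cn,
                                  chunk.mean().item(),
                                  chunk.var(correction=0).item() * cn)
var, ref_mean = torch.var_mean(x, correction=0)
assert abs(mean - ref_mean.item()) < 1e-4
assert abs(m2 / n - var.item()) < 1e-4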
V0627 17:31:03.542000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:03.542000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1, "has_payload": "1dfadefa57d2d698b82df0a252ee757b"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839714901584)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| | | +- GuardManager: source=L['self'].config, accessed_by=DictGetItemGuardAccessor(config)
| | | | +- TYPE_MATCH: ___check_type_id(L['self'].config, 139839810624528)
| | | +- GuardManager: source=L['self']._modules, accessed_by=DictGetItemGuardAccessor(_modules)
| | | | +- GuardManager: source=L['self'].encoder, accessed_by=DictGetItemGuardAccessor(encoder)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].encoder, 139839713378016)
| | | | | +- GuardManager: source=L['self'].encoder.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- GuardManager: source=L['self'].encoder.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].encoder.training, 7685824)
| | | | +- GuardManager: source=L['self'].embeddings, accessed_by=DictGetItemGuardAccessor(embeddings)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings, 139839713378208)
| | | | | +- GuardManager: source=L['self'].embeddings.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- DICT_CONTAINS: not ___dict_contains('forward', L['self'].embeddings.__dict__)
| | | | | | +- GuardManager: source=L['self'].embeddings.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.training, 7685824)
| | | | | | +- GuardManager: source=L['self'].embeddings._modules, accessed_by=DictGetItemGuardAccessor(_modules)
| | | | | | | +- GuardManager: source=L['self'].embeddings.dropout, accessed_by=DictGetItemGuardAccessor(dropout)
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.dropout, 139839202278704)
| | | | | | | | +- GuardManager: source=L['self'].embeddings.dropout.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | | | | +- GuardManager: source=L['self'].embeddings.dropout.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.dropout.training, 7685824)
| | | | | | | +- GuardManager: source=L['self'].embeddings.LayerNorm, accessed_by=DictGetItemGuardAccessor(LayerNorm)
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.LayerNorm, 139839202278800)
| | | | | | | | +- GuardManager: source=L['self'].embeddings.LayerNorm.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | | | | +- GuardManager: source=L['self'].embeddings.LayerNorm.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.LayerNorm.training, 7685824)
| | | | | | | +- GuardManager: source=L['self'].embeddings.word_embeddings, accessed_by=DictGetItemGuardAccessor(word_embeddings)
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.word_embeddings, 139839202271840)
| | | | | | | | +- GuardManager: source=L['self'].embeddings.word_embeddings.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | | | | +- GuardManager: source=L['self'].embeddings.word_embeddings.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.word_embeddings.training, 7685824)
| | | | | | | +- GuardManager: source=L['self'].embeddings.position_embeddings, accessed_by=DictGetItemGuardAccessor(position_embeddings)
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.position_embeddings, 139839202279184)
| | | | | | | | +- GuardManager: source=L['self'].embeddings.position_embeddings.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | | | | +- GuardManager: source=L['self'].embeddings.position_embeddings.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.position_embeddings.training, 7685824)
| | | | | | | +- GuardManager: source=L['self'].embeddings.token_type_embeddings, accessed_by=DictGetItemGuardAccessor(token_type_embeddings)
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.token_type_embeddings, 139839202279328)
| | | | | | | | +- GuardManager: source=L['self'].embeddings.token_type_embeddings.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | | | | +- GuardManager: source=L['self'].embeddings.token_type_embeddings.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.token_type_embeddings.training, 7685824)
| | | | | | +- GuardManager: source=L['self'].embeddings._buffers, accessed_by=DictGetItemGuardAccessor(_buffers)
| | | | | | | +- GuardManager: source=L['self'].embeddings.position_ids, accessed_by=DictGetItemGuardAccessor(position_ids)
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.position_ids, 139838528701040)
| | | | | | +- GuardManager: source=L['self'].embeddings._forward_hooks, accessed_by=DictGetItemGuardAccessor(_forward_hooks)
| | | | | | +- GuardManager: source=L['self'].embeddings._backward_hooks, accessed_by=DictGetItemGuardAccessor(_backward_hooks)
| | | | | | +- GuardManager: source=L['self'].embeddings._forward_pre_hooks, accessed_by=DictGetItemGuardAccessor(_forward_pre_hooks)
| | | | | | +- GuardManager: source=L['self'].embeddings.rescale_embeddings, accessed_by=DictGetItemGuardAccessor(rescale_embeddings)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.rescale_embeddings, 7685824)
| | | | | | +- GuardManager: source=L['self'].embeddings._backward_pre_hooks, accessed_by=DictGetItemGuardAccessor(_backward_pre_hooks)
| | | +- GuardManager: source=L['self'].block_size, accessed_by=DictGetItemGuardAccessor(block_size)
| | | | +- EQUALS_MATCH: L['self'].block_size == 64
| | | +- GuardManager: source=L['self'].attention_type, accessed_by=DictGetItemGuardAccessor(attention_type)
| | | | +- EQUALS_MATCH: L['self'].attention_type == 'block_sparse'
| +- GuardManager: source=L['___stack0'], accessed_by=DictGetItemGuardAccessor(___stack0)
| | +- TYPE_MATCH: ___check_type_id(L['___stack0'], 7625984)
| | +- LENGTH_CHECK: len(L['___stack0']) == 6
| | +- GuardManager: source=L['___stack0'][0], accessed_by=TupleGetItemGuardAccessor(0)
| | | +- EQUALS_MATCH: L['___stack0'][0] == 13
| | +- GuardManager: source=L['___stack0'][1], accessed_by=TupleGetItemGuardAccessor(1)
| | | +- TENSOR_MATCH: check_tensor(L['___stack0'][1], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int64, device=None, requires_grad=False, size=[1, 832], stride=[832, 1])
| | | +- NO_HASATTR: hasattr(L['___stack0'][1], '_dynamo_dynamic_indices') == False
| | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['___stack0'][1], L['___stack0'][2], L['___stack0'][3])
| | +- GuardManager: source=L['___stack0'][2], accessed_by=TupleGetItemGuardAccessor(2)
| | | +- TENSOR_MATCH: check_tensor(L['___stack0'][2], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 832], stride=[832, 1])
| | | +- NO_HASATTR: hasattr(L['___stack0'][2], '_dynamo_dynamic_indices') == False
| | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['___stack0'][1], L['___stack0'][2], L['___stack0'][3])
| | +- GuardManager: source=L['___stack0'][3], accessed_by=TupleGetItemGuardAccessor(3)
| | | +- TENSOR_MATCH: check_tensor(L['___stack0'][3], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int64, device=None, requires_grad=False, size=[1, 832], stride=[832, 1])
| | | +- NO_HASATTR: hasattr(L['___stack0'][3], '_dynamo_dynamic_indices') == False
| | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['___stack0'][1], L['___stack0'][2], L['___stack0'][3])
| | +- GuardManager: source=L['___stack0'][4], accessed_by=TupleGetItemGuardAccessor(4)
| | | +- ID_MATCH: ___check_obj_id(L['___stack0'][4], 7636800)
| | +- GuardManager: source=L['___stack0'][5], accessed_by=TupleGetItemGuardAccessor(5)
| | | +- ID_MATCH: ___check_obj_id(L['___stack0'][5], 7636800)
| +- GuardManager: source=L['head_mask'], accessed_by=DictGetItemGuardAccessor(head_mask)
| | +- ID_MATCH: ___check_obj_id(L['head_mask'], 7636800)
| +- GuardManager: source=L['use_cache'], accessed_by=DictGetItemGuardAccessor(use_cache)
| | +- ID_MATCH: ___check_obj_id(L['use_cache'], 7685824)
| +- GuardManager: source=L['return_dict'], accessed_by=DictGetItemGuardAccessor(return_dict)
| | +- ID_MATCH: ___check_obj_id(L['return_dict'], 7685856)
| +- GuardManager: source=L['past_key_values'], accessed_by=DictGetItemGuardAccessor(past_key_values)
| | +- ID_MATCH: ___check_obj_id(L['past_key_values'], 7636800)
| +- GuardManager: source=L['output_attentions'], accessed_by=DictGetItemGuardAccessor(output_attentions)
| | +- ID_MATCH: ___check_obj_id(L['output_attentions'], 7685824)
| +- GuardManager: source=L['output_hidden_states'], accessed_by=DictGetItemGuardAccessor(output_hidden_states)
| | +- ID_MATCH: ___check_obj_id(L['output_hidden_states'], 7685824)
| +- GuardManager: source=L['encoder_hidden_states'], accessed_by=DictGetItemGuardAccessor(encoder_hidden_states)
| | +- ID_MATCH: ___check_obj_id(L['encoder_hidden_states'], 7636800)
| +- GuardManager: source=L['past_key_values_length'], accessed_by=DictGetItemGuardAccessor(past_key_values_length)
| | +- EQUALS_MATCH: L['past_key_values_length'] == 0
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['torch'], accessed_by=DictGetItemGuardAccessor(torch)
| | | +- ID_MATCH: ___check_obj_id(G['torch'], 139845236322800)
| | | +- GuardManager: source=G['torch'].cat, accessed_by=GetAttrGuardAccessor(cat)
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].cat, 139845228834672)
| | | +- GuardManager: source=G['torch'].einsum, accessed_by=GetAttrGuardAccessor(einsum)
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].einsum, 139842415911568)
| | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'], accessed_by=DictGetItemGuardAccessor(__import_torch_dot_nn_dot_modules_dot_module)
| | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'], 139842442598640)
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'].torch, accessed_by=GetAttrGuardAccessor(torch)
| | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'].torch, 139845236322800)
| | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C, accessed_by=GetAttrGuardAccessor(_C)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C, 139845228547104)
| | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C._get_tracing_state, accessed_by=GetAttrGuardAccessor(_get_tracing_state)
| | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C._get_tracing_state, 139842451895088)
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_hooks, accessed_by=GetAttrGuardAccessor(_global_forward_hooks)
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_hooks, 7497696)
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_hooks
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_hooks, accessed_by=GetAttrGuardAccessor(_global_backward_hooks)
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_hooks, 7497696)
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_hooks
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_pre_hooks, accessed_by=GetAttrGuardAccessor(_global_forward_pre_hooks)
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_pre_hooks, 7497696)
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_pre_hooks
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_pre_hooks, accessed_by=GetAttrGuardAccessor(_global_backward_pre_hooks)
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_pre_hooks, 7497696)
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_pre_hooks
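The guard kinds in the tree above reduce to simple runtime predicates. A minimal Python sketch (not Dynamo's actual C++ guard manager; the integer constants such as 7636800 are assumed here to be process-local CPython object ids of singletons like None/False/True, captured at compile time):

def check_obj_id(obj, expected_id):
    # ID_MATCH / ___check_obj_id: identity against an id recorded at compile time
    return id(obj) == expected_id

def equals_match(value, expected):
    # EQUALS_MATCH, e.g. L['self'].block_size == 64
    return value == expected

def dict_length_empty(d):
    # DICT_LENGTH of the form "not G[...]._global_forward_hooks"
    return not d

past_key_values, use_cache = None, False
assert check_obj_id(past_key_values, id(None))
assert check_obj_id(use_cache, id(False))
assert equals_match(64, 64)
assert dict_length_empty({})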
V0627 17:31:03.543000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "5/0", "frame_key": "10", "co_name": "torch_dynamo_resume_in_forward_at_2077", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 2077, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 57, "shape_env_guard_count": 0, "graph_op_count": 18, "graph_node_count": 23, "graph_input_count": 3, "start_time": 1719534662.9121282, "entire_frame_compile_time_s": 0.6307895183563232, "backend_compile_time_s": 0.49609994888305664, "inductor_compile_time_s": 0.37875938415527344, "code_gen_time_s": 0.3245351314544678, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Graph break due to unsupported builtin numpy.random.mtrand.seed. This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind). If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround. If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/docs/main/notes/custom_operators.html for more details) or, if it is traceable, use torch.compiler.allow_in_graph."], "dynamo_time_before_restart_s": 0.08347654342651367, "has_guarded_code": true}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1}
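The restart_reasons entry above records a graph break on numpy.random.mtrand.seed; the frame name torch_dynamo_resume_in_forward_at_2077 is the continuation Dynamo compiles after that break. A small repro sketch of the same class of break (hypothetical function f, eager backend):

import numpy as np
import torch

@torch.compile(backend="eager")
def f(x):
    np.random.seed(0)  # unsupported C-extension builtin -> graph break here
    return x + 1       # traced separately in a resume frame

f(torch.ones(2))

Per the message in the log, wrapping the offending call in a PyTorch custom operator, or marking it with torch.compiler.allow_in_graph when it is traceable, avoids the frame split.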
V0627 17:31:03.544000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.565000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 19, "size": 442368}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.565000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 19}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.565000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 19, "id": 0, "source": "L['band_mask']"}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.566000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 19, "size": 2555904}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.566000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6e30>", "describer_id": 19}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.566000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 19, "id": 1, "source": "L['hidden_states']"}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.567000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 19, "size": 3328}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.568000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 19}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.568000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 2, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 19}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.568000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 19, "id": 2, "source": "L['from_mask']"}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.570000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 5, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 19}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.570000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 19, "id": 5, "source": "L['to_mask']"}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.600000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "6/0", "frame_key": "11", "co_name": "forward", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 1578, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": null, "shape_env_guard_count": null, "graph_op_count": null, "graph_node_count": null, "graph_input_count": null, "start_time": 1719534663.5449224, "entire_frame_compile_time_s": null, "backend_compile_time_s": null, "inductor_compile_time_s": null, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": [], "dynamo_time_before_restart_s": 0.05504441261291504, "has_guarded_code": false}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.600000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.607000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 20, "size": 442368}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.608000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 20}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.608000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 20, "id": 0, "source": "L['band_mask']"}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.609000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 20, "size": 2555904}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.609000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6e30>", "describer_id": 20}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.609000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 20, "id": 1, "source": "L['hidden_states']"}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.610000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 20, "size": 3328}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.610000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 20}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.610000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 2, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 20}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.611000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 20, "id": 2, "source": "L['from_mask']"}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.612000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 4, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 20}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.612000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 20, "id": 4, "source": "L['to_mask']"}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.643000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 21, "size": 2555904}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.643000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6e30>", "describer_id": 21}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.643000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 21, "id": 0, "source": "L['hidden_states']"}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.644000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 21, "size": 442368}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.644000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 21}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.644000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 21, "id": 1, "source": "L['band_mask']"}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.645000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 21, "size": 3328}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.645000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 21}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.645000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 2, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 21}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.645000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 21, "id": 2, "source": "L['from_mask']"}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.646000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 4, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 21}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.647000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 21, "id": 4, "source": "L['to_mask']"}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.648000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 5, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 13, 64], "is_leaf": true, "is_view": true, "stride": [832, 64, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317fbec0>", "describer_id": 21}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.648000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 21, "id": 5, "source": "L['blocked_encoder_mask']"}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.656000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:03.656000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1, "has_payload": "76aa2c3aac969b0b973556e5e5d20d8b"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202276400)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| | | +- GuardManager: source=L['self']._modules, accessed_by=DictGetItemGuardAccessor(_modules)
| | | | +- GuardManager: source=L['self'].attention, accessed_by=DictGetItemGuardAccessor(attention)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].attention, 139839202275632)
| | | | | +- GuardManager: source=L['self'].attention.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- GuardManager: source=L['self'].attention.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].attention.training, 7685824)
| +- GuardManager: source=L['to_mask'], accessed_by=DictGetItemGuardAccessor(to_mask)
| | +- TENSOR_MATCH: check_tensor(L['to_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 1, 832], stride=[832, 832, 832, 1])
| | +- NO_HASATTR: hasattr(L['to_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['blocked_encoder_mask'])
| +- GuardManager: source=L['band_mask'], accessed_by=DictGetItemGuardAccessor(band_mask)
| | +- TENSOR_MATCH: check_tensor(L['band_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 9, 64, 192], stride=[110592, 110592, 12288, 192, 1])
| | +- NO_HASATTR: hasattr(L['band_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['blocked_encoder_mask'])
| +- GuardManager: source=L['from_mask'], accessed_by=DictGetItemGuardAccessor(from_mask)
| | +- TENSOR_MATCH: check_tensor(L['from_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 832, 1], stride=[832, 832, 1, 1])
| | +- NO_HASATTR: hasattr(L['from_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['blocked_encoder_mask'])
| +- GuardManager: source=L['head_mask'], accessed_by=DictGetItemGuardAccessor(head_mask)
| | +- ID_MATCH: ___check_obj_id(L['head_mask'], 7636800)
| +- GuardManager: source=L['hidden_states'], accessed_by=DictGetItemGuardAccessor(hidden_states)
| | +- TENSOR_MATCH: check_tensor(L['hidden_states'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 832, 768], stride=[638976, 768, 1])
| | +- NO_HASATTR: hasattr(L['hidden_states'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['blocked_encoder_mask'])
| +- GuardManager: source=L['attention_mask'], accessed_by=DictGetItemGuardAccessor(attention_mask)
| | +- ID_MATCH: ___check_obj_id(L['attention_mask'], 7636800)
| +- GuardManager: source=L['past_key_value'], accessed_by=DictGetItemGuardAccessor(past_key_value)
| | +- ID_MATCH: ___check_obj_id(L['past_key_value'], 7636800)
| +- GuardManager: source=L['output_attentions'], accessed_by=DictGetItemGuardAccessor(output_attentions)
| | +- ID_MATCH: ___check_obj_id(L['output_attentions'], 7685824)
| +- GuardManager: source=L['blocked_encoder_mask'], accessed_by=DictGetItemGuardAccessor(blocked_encoder_mask)
| | +- TENSOR_MATCH: check_tensor(L['blocked_encoder_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 13, 64], stride=[832, 64, 1])
| | +- NO_HASATTR: hasattr(L['blocked_encoder_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['blocked_encoder_mask'])
| +- GuardManager: source=L['encoder_hidden_states'], accessed_by=DictGetItemGuardAccessor(encoder_hidden_states)
| | +- ID_MATCH: ___check_obj_id(L['encoder_hidden_states'], 7636800)
| +- GuardManager: source=L['encoder_attention_mask'], accessed_by=DictGetItemGuardAccessor(encoder_attention_mask)
| | +- ID_MATCH: ___check_obj_id(L['encoder_attention_mask'], 7636800)
V0627 17:31:03.657000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "7/0", "frame_key": "12", "co_name": "forward", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 1472, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 18, "shape_env_guard_count": 0, "graph_op_count": 0, "graph_node_count": 5, "graph_input_count": 5, "start_time": 1719534663.6010149, "entire_frame_compile_time_s": 0.05594229698181152, "backend_compile_time_s": null, "inductor_compile_time_s": null, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Graph break due to unsupported builtin numpy.random.mtrand.seed. This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind). If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround. If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/docs/main/notes/custom_operators.html for more details) or, if it is traceable, use torch.compiler.allow_in_graph."], "dynamo_time_before_restart_s": 0.039438724517822266, "has_guarded_code": true}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.657000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.659000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 22, "size": 442368}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.659000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 22}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.659000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 22, "id": 0, "source": "L['band_mask']"}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.660000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 22, "size": 2555904}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.660000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6e30>", "describer_id": 22}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.660000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 22, "id": 1, "source": "L['hidden_states']"}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.661000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 22, "size": 3328}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.662000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 22}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.662000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 2, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 22}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.662000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 22, "id": 2, "source": "L['from_mask']"}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.664000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 4, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 22}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.664000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 22, "id": 4, "source": "L['to_mask']"}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.694000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 23, "size": 442368}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.694000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 23}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.694000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 23, "id": 0, "source": "L['band_mask']"}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.695000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 23, "size": 2555904}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.695000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6e30>", "describer_id": 23}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.695000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 23, "id": 1, "source": "L['hidden_states']"}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.696000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 23, "size": 3328}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.696000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 23}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.697000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 2, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 23}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.697000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 23, "id": 2, "source": "L['from_mask']"}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.698000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 4, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 23}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.699000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 23, "id": 4, "source": "L['to_mask']"}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.701000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 5, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 13, 64], "is_leaf": true, "is_view": true, "stride": [832, 64, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317fbec0>", "describer_id": 23}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.701000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 23, "id": 5, "source": "L['from_blocked_mask']"}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.704000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {"l_band_mask_": [1, 1, 9, 64, 192], "l_from_mask_": [1, 1, 832, 1], "l_to_mask_": [1, 1, 1, 832], "band_mask": [1, 1, 9, 64, 192], "from_mask": [1, 1, 832, 1], "to_mask": [1, 1, 1, 832]}}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1, "has_payload": "b5eca5f100188f494b5033015854eb4e"}
class GraphModule(torch.nn.Module):
def forward(self, L_band_mask_: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu", L_from_mask_: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu", L_to_mask_: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu"):
l_band_mask_ = L_band_mask_
l_from_mask_ = L_from_mask_
l_to_mask_ = L_to_mask_
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1383 in forward, code: band_mask = band_mask.to(hidden_states.dtype)
band_mask: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = l_band_mask_.to(torch.float32); l_band_mask_ = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1385 in forward, code: from_mask = from_mask.to(hidden_states.dtype)
from_mask: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu" = l_from_mask_.to(torch.float32); l_from_mask_ = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1387 in forward, code: to_mask = to_mask.to(hidden_states.dtype)
to_mask: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = l_to_mask_.to(torch.float32); l_to_mask_ = None
return (band_mask, from_mask, to_mask)
V0627 17:31:03.718000 139845268738432 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:199] {"aot_forward_graph": {}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1, "has_payload": "3ad949bb5c0c76a73cad2e99f5c9ebe2"}
class <lambda>(torch.nn.Module):
def forward(self, arg0_1: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu", arg1_1: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu", arg2_1: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu"):
return (arg0_1, arg1_1, arg2_1)
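The three .to(hidden_states.dtype) calls captured in the Dynamo graph for this frame vanish in the AOT forward graph above: hidden_states is already float32, so each conversion is a no-op that functionalization drops, leaving a pure pass-through. A quick standalone check, with the band_mask shape taken from the trace:

import torch

band_mask = torch.zeros(1, 1, 9, 64, 192)
out = band_mask.to(torch.float32)
print(out is band_mask)  # True: same-dtype, same-device .to returns the input tensor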
V0627 17:31:03.731000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:03.731000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1, "has_payload": "6f76e9e822f6dc2ebb0dbc0f0100927d"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202275632)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| | | +- GuardManager: source=L['self']._modules, accessed_by=DictGetItemGuardAccessor(_modules)
| | | | +- GuardManager: source=L['self'].self, accessed_by=DictGetItemGuardAccessor(self)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].self, 139839202274384)
| | | | | +- GuardManager: source=L['self'].self.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- GuardManager: source=L['self'].self.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].self.training, 7685824)
| | | +- GuardManager: source=L['self'].attention_type, accessed_by=DictGetItemGuardAccessor(attention_type)
| | | | +- EQUALS_MATCH: L['self'].attention_type == 'block_sparse'
| +- GuardManager: source=L['to_mask'], accessed_by=DictGetItemGuardAccessor(to_mask)
| | +- TENSOR_MATCH: check_tensor(L['to_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 1, 832], stride=[832, 832, 832, 1])
| | +- NO_HASATTR: hasattr(L['to_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['band_mask'], accessed_by=DictGetItemGuardAccessor(band_mask)
| | +- TENSOR_MATCH: check_tensor(L['band_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 9, 64, 192], stride=[110592, 110592, 12288, 192, 1])
| | +- NO_HASATTR: hasattr(L['band_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['from_mask'], accessed_by=DictGetItemGuardAccessor(from_mask)
| | +- TENSOR_MATCH: check_tensor(L['from_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 832, 1], stride=[832, 832, 1, 1])
| | +- NO_HASATTR: hasattr(L['from_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['hidden_states'], accessed_by=DictGetItemGuardAccessor(hidden_states)
| | +- TENSOR_MATCH: check_tensor(L['hidden_states'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 832, 768], stride=[638976, 768, 1])
| | +- NO_HASATTR: hasattr(L['hidden_states'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['from_blocked_mask'], accessed_by=DictGetItemGuardAccessor(from_blocked_mask)
| | +- TENSOR_MATCH: check_tensor(L['from_blocked_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 13, 64], stride=[832, 64, 1])
| | +- NO_HASATTR: hasattr(L['from_blocked_mask'], '_dynamo_dynamic_indices') == False
| | +- TENSOR_ALIASING: L['from_blocked_mask'] is L['to_blocked_mask']
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['output_attentions'], accessed_by=DictGetItemGuardAccessor(output_attentions)
| | +- ID_MATCH: ___check_obj_id(L['output_attentions'], 7685824)
| +- GuardManager: source=L['encoder_hidden_states'], accessed_by=DictGetItemGuardAccessor(encoder_hidden_states)
| | +- ID_MATCH: ___check_obj_id(L['encoder_hidden_states'], 7636800)
| +- GuardManager: source=L['to_blocked_mask'], accessed_by=DictGetItemGuardAccessor(to_blocked_mask)
| | +- TENSOR_ALIASING: L['from_blocked_mask'] is L['to_blocked_mask']
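TENSOR_MATCH pins dtype, size, stride, and requires_grad, while TENSOR_ALIASING / NO_TENSOR_ALIASING pin the aliasing pattern observed at compile time (here from_blocked_mask and to_blocked_mask were the same object). A simplified Python analogue (assumption: the real checks run in C++ and also validate device and the DispatchKeySet shown above):

import torch

def tensor_match(t, dtype, size, stride, requires_grad=False):
    return (type(t) is torch.Tensor
            and t.dtype == dtype
            and t.requires_grad == requires_grad
            and tuple(t.size()) == tuple(size)
            and tuple(t.stride()) == tuple(stride))

def check_no_aliasing(*tensors):
    # NO_TENSOR_ALIASING: every guarded tensor must be a distinct object
    return len({id(t) for t in tensors}) == len(tensors)

mask = torch.ones(1, 832)
from_blocked_mask = mask.view(1, 13, 64)
to_blocked_mask = from_blocked_mask  # TENSOR_ALIASING: must stay the same object
assert tensor_match(from_blocked_mask, torch.float32, (1, 13, 64), (832, 64, 1))
assert to_blocked_mask is from_blocked_mask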
V0627 17:31:03.732000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "8/0", "frame_key": "13", "co_name": "forward", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 1365, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 16, "shape_env_guard_count": 0, "graph_op_count": 3, "graph_node_count": 7, "graph_input_count": 3, "start_time": 1719534663.657894, "entire_frame_compile_time_s": 0.07398724555969238, "backend_compile_time_s": 0.02206587791442871, "inductor_compile_time_s": 0.0003921985626220703, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Graph break due to unsupported builtin numpy.random.mtrand.seed. This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind). If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround. If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/docs/main/notes/custom_operators.html for more details) or, if it is traceable, use torch.compiler.allow_in_graph."], "dynamo_time_before_restart_s": 0.03484821319580078, "has_guarded_code": true}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.732000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 9, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.733000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 25, "size": 2555904}, "frame_id": 9, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.733000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6e30>", "describer_id": 25}, "frame_id": 9, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.733000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 25, "id": 0, "source": "L['hidden_states']"}, "frame_id": 9, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.752000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 26, "size": 2555904}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.752000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6e30>", "describer_id": 26}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.752000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 26, "id": 0, "source": "L['hidden_states']"}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.763000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 7, "describer_id": 26, "size": 442368}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.763000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 7, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 7, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 26}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.763000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 26, "id": 7, "source": "L['band_mask']"}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.764000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 8, "describer_id": 26, "size": 3328}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.764000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 9, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 8, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 26}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.764000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 8, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 8, "base": 9, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 26}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.764000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 26, "id": 8, "source": "L['from_mask']"}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.765000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 10, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 8, "base": 9, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 26}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.765000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 26, "id": 10, "source": "L['to_mask']"}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.766000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 11, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 13, 64], "is_leaf": true, "is_view": true, "stride": [832, 64, 1], "storage": 8, "base": 9, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317fbec0>", "describer_id": 26}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.766000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 26, "id": 11, "source": "L['from_blocked_mask']"}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1}
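The describe_tensor records above show that from_mask, to_mask, and from_blocked_mask are all views of one [1, 832] float mask created under no_grad (they share storage id 8 with base id 9). Reconstructed with the logged shapes and strides:

import torch

with torch.no_grad():
    mask = torch.ones(1, 832)                 # base tensor; storage shared by all views
    from_mask = mask.view(1, 1, 832, 1)       # stride (832, 832, 1, 1)
    to_mask = mask.view(1, 1, 1, 832)         # stride (832, 832, 832, 1)
    from_blocked_mask = mask.view(1, 13, 64)  # stride (832, 64, 1)

print(from_mask._base is mask and to_mask._base is mask)  # True: views track their base
print(from_mask.untyped_storage().data_ptr() == to_mask.untyped_storage().data_ptr())  # True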
V0627 17:31:03.768000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {"l_hidden_states_": [1, 832, 768], "l__self___query": [1, 832, 768], "x": [1, 832, 12, 64], "query_layer": [1, 12, 832, 64], "l__self___key": [1, 832, 768], "x_1": [1, 832, 12, 64], "key_layer": [1, 12, 832, 64], "l__self___value": [1, 832, 768], "x_2": [1, 832, 12, 64], "value_layer": [1, 12, 832, 64]}}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1, "has_payload": "0aeb326082c3dce6efb144a844a46bdf"}
class GraphModule(torch.nn.Module):
def forward(self, L_hidden_states_: "f32[1, 832, 768][638976, 768, 1]cpu"):
l_hidden_states_ = L_hidden_states_
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:468 in forward, code: query_layer = self.transpose_for_scores(self.query(hidden_states))
l__self___query: "bf16[1, 832, 768][638976, 768, 1]cpu" = self.L__self___query(l_hidden_states_)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
x: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = l__self___query.view(1, 832, 12, 64); l__self___query = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
query_layer: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = x.permute(0, 2, 1, 3); x = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:469 in forward, code: key_layer = self.transpose_for_scores(self.key(hidden_states))
l__self___key: "bf16[1, 832, 768][638976, 768, 1]cpu" = self.L__self___key(l_hidden_states_)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
x_1: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = l__self___key.view(1, 832, 12, 64); l__self___key = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
key_layer: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = x_1.permute(0, 2, 1, 3); x_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:470 in forward, code: value_layer = self.transpose_for_scores(self.value(hidden_states))
l__self___value: "bf16[1, 832, 768][638976, 768, 1]cpu" = self.L__self___value(l_hidden_states_); l_hidden_states_ = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
x_2: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = l__self___value.view(1, 832, 12, 64); l__self___value = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
value_layer: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = x_2.permute(0, 2, 1, 3); x_2 = None
return (query_layer, key_layer, value_layer)
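The bf16 annotations on l__self___query/key/value in the graph above come from AutocastCPU (also visible in the DispatchKeySet of the tensor guards), and the [638976, 64, 768, 1] output strides come from the view + permute in transpose_for_scores. Both reproduce standalone; this is a sketch with shapes taken from the trace, not the model's actual module:

import torch

B, S, H, D = 1, 832, 12, 64
hidden_states = torch.randn(B, S, H * D)      # float32 input, as in the trace
query = torch.nn.Linear(H * D, H * D)

with torch.autocast("cpu", dtype=torch.bfloat16):
    q = query(hidden_states)                  # AutocastCPU runs Linear in bf16

query_layer = q.view(B, S, H, D).permute(0, 2, 1, 3)  # transpose_for_scores pattern
print(q.dtype)               # torch.bfloat16
print(query_layer.shape)     # torch.Size([1, 12, 832, 64])
print(query_layer.stride())  # (638976, 64, 768, 1)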
V0627 17:31:03.817000 139845268738432 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:199] {"aot_forward_graph": {}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1, "has_payload": "d91c82abb65a6c80f96ee652ae86f63d"}
class <lambda>(torch.nn.Module):
    def forward(self, arg0_1: "f32[768, 768][768, 1]cpu", arg1_1: "f32[768][1]cpu", arg2_1: "f32[768, 768][768, 1]cpu", arg3_1: "f32[768][1]cpu", arg4_1: "f32[768, 768][768, 1]cpu", arg5_1: "f32[768][1]cpu", arg6_1: "f32[1, 832, 768][638976, 768, 1]cpu"):
        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:468 in forward, code: query_layer = self.transpose_for_scores(self.query(hidden_states))
        convert_element_type: "bf16[768][1]cpu" = torch.ops.prims.convert_element_type.default(arg1_1, torch.bfloat16); arg1_1 = None
        convert_element_type_1: "bf16[768, 768][768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg0_1, torch.bfloat16); arg0_1 = None
        convert_element_type_2: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg6_1, torch.bfloat16)
        view: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.view.default(convert_element_type_2, [832, 768]); convert_element_type_2 = None
        permute: "bf16[768, 768][1, 768]cpu" = torch.ops.aten.permute.default(convert_element_type_1, [1, 0]); convert_element_type_1 = None
        addmm: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.addmm.default(convert_element_type, view, permute); convert_element_type = view = permute = None
        view_1: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.view.default(addmm, [1, 832, 768]); addmm = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
        view_2: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.view.default(view_1, [1, 832, 12, 64]); view_1 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
        permute_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_2, [0, 2, 1, 3]); view_2 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:469 in forward, code: key_layer = self.transpose_for_scores(self.key(hidden_states))
        convert_element_type_6: "bf16[768][1]cpu" = torch.ops.prims.convert_element_type.default(arg3_1, torch.bfloat16); arg3_1 = None
        convert_element_type_7: "bf16[768, 768][768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg2_1, torch.bfloat16); arg2_1 = None
        convert_element_type_8: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg6_1, torch.bfloat16)
        view_3: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.view.default(convert_element_type_8, [832, 768]); convert_element_type_8 = None
        permute_2: "bf16[768, 768][1, 768]cpu" = torch.ops.aten.permute.default(convert_element_type_7, [1, 0]); convert_element_type_7 = None
        addmm_1: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.addmm.default(convert_element_type_6, view_3, permute_2); convert_element_type_6 = view_3 = permute_2 = None
        view_4: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.view.default(addmm_1, [1, 832, 768]); addmm_1 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
        view_5: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.view.default(view_4, [1, 832, 12, 64]); view_4 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
        permute_3: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_5, [0, 2, 1, 3]); view_5 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:470 in forward, code: value_layer = self.transpose_for_scores(self.value(hidden_states))
        convert_element_type_12: "bf16[768][1]cpu" = torch.ops.prims.convert_element_type.default(arg5_1, torch.bfloat16); arg5_1 = None
        convert_element_type_13: "bf16[768, 768][768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg4_1, torch.bfloat16); arg4_1 = None
        convert_element_type_14: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg6_1, torch.bfloat16); arg6_1 = None
        view_6: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.view.default(convert_element_type_14, [832, 768]); convert_element_type_14 = None
        permute_4: "bf16[768, 768][1, 768]cpu" = torch.ops.aten.permute.default(convert_element_type_13, [1, 0]); convert_element_type_13 = None
        addmm_2: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.addmm.default(convert_element_type_12, view_6, permute_4); convert_element_type_12 = view_6 = permute_4 = None
        view_7: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.view.default(addmm_2, [1, 832, 768]); addmm_2 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
        view_8: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.view.default(view_7, [1, 832, 12, 64]); view_7 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
        permute_5: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_8, [0, 2, 1, 3]); view_8 = None
        return (permute_1, permute_3, permute_5)
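In the AOT forward graph, each nn.Linear has been decomposed under CPU autocast into bfloat16 convert_element_type casts followed by view + permute + addmm. A hedged sketch verifying that this decomposition matches nn.functional.linear (the bf16 casts are elided here so the fp32 comparison stays exact):

import torch

x = torch.randn(1, 832, 768)
w = torch.randn(768, 768)
b = torch.randn(768)

# view -> addmm on the transposed weight -> view, as in the graph above
decomposed = torch.addmm(b, x.view(832, 768), w.permute(1, 0)).view(1, 832, 768)
reference = torch.nn.functional.linear(x, w, b)
assert torch.allclose(decomposed, reference, atol=1e-5)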
V0627 17:31:03.886000 139845268738432 torch/_inductor/compile_fx.py:754] {"inductor_post_grad_graph": {}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1, "has_payload": "1df7e432445d21ccb26ae7f35b4ee86e"}
class <lambda>(torch.nn.Module):
    def forward(self, arg6_1: "f32[1, 832, 768][638976, 768, 1]cpu"):
        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:468 in forward, code: query_layer = self.transpose_for_scores(self.query(hidden_states))
        _frozen_param6: "bf16[768][1]cpu" = self._frozen_param6

        # No stacktrace found for following nodes
        _frozen_param12: "bf16[768, 768][1, 0]cpu" = self._frozen_param12

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:469 in forward, code: key_layer = self.transpose_for_scores(self.key(hidden_states))
        _frozen_param8: "bf16[768][1]cpu" = self._frozen_param8

        # No stacktrace found for following nodes
        _frozen_param13: "bf16[768, 768][1, 0]cpu" = self._frozen_param13

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:470 in forward, code: value_layer = self.transpose_for_scores(self.value(hidden_states))
        _frozen_param10: "bf16[768][1]cpu" = self._frozen_param10

        # No stacktrace found for following nodes
        _frozen_param14: "bf16[768, 768][1, 0]cpu" = self._frozen_param14

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:468 in forward, code: query_layer = self.transpose_for_scores(self.query(hidden_states))
        convert_element_type_2: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg6_1, torch.bfloat16); arg6_1 = None
        _linear_pointwise_default_5: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.mkldnn._linear_pointwise.default(convert_element_type_2, _frozen_param12, _frozen_param6, 'none', [], ''); _frozen_param12 = _frozen_param6 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
        view_2: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.reshape.default(_linear_pointwise_default_5, [1, 832, 12, 64]); _linear_pointwise_default_5 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
        permute_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_2, [0, 2, 1, 3]); view_2 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:469 in forward, code: key_layer = self.transpose_for_scores(self.key(hidden_states))
        _linear_pointwise_default_4: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.mkldnn._linear_pointwise.default(convert_element_type_2, _frozen_param13, _frozen_param8, 'none', [], ''); _frozen_param13 = _frozen_param8 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
        view_5: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.reshape.default(_linear_pointwise_default_4, [1, 832, 12, 64]); _linear_pointwise_default_4 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
        permute_3: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_5, [0, 2, 1, 3]); view_5 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:470 in forward, code: value_layer = self.transpose_for_scores(self.value(hidden_states))
        _linear_pointwise_default_3: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.mkldnn._linear_pointwise.default(convert_element_type_2, _frozen_param14, _frozen_param10, 'none', [], ''); convert_element_type_2 = _frozen_param14 = _frozen_param10 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
        view_8: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.reshape.default(_linear_pointwise_default_3, [1, 832, 12, 64]); _linear_pointwise_default_3 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
        permute_5: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_8, [0, 2, 1, 3]); view_8 = None
        return (permute_1, permute_3, permute_5)
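Relative to the AOT graph, this post-grad graph has been frozen: the per-call weight/bias casts were constant-folded into the _frozen_param* attributes (the nonstandard [1, 0] strides are apparently an artifact of MKLDNN weight prepacking), and each addmm was rewritten to torch.ops.mkldnn._linear_pointwise, so only the activation cast (convert_element_type_2) survives at runtime. A hedged sketch of the knobs that drive this path (model and input below are placeholders; defaults vary by PyTorch build):

import torch

torch._inductor.config.freezing = True  # fold parameters into graph constants

model = MyModel().eval()                # placeholder: freezing targets inference graphs
compiled = torch.compile(model)
with torch.no_grad(), torch.autocast("cpu", dtype=torch.bfloat16):
    out = compiled(example_input)       # placeholder input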
V0627 17:31:03.909000 139845268738432 torch/_inductor/graph.py:1693] {"inductor_output_code": {"filename": "/tmp/torchinductor_leslie/rm/crmmdl3pvsdue2ht6qffev3qnvhhdsc4zixorhqtjreztfur5zhi.py"}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1, "has_payload": "a40072c55bb96853547fea577aa47ba2"}
# AOT ID: ['5_inference']
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
from torch._inductor.codegen.memory_planning import _align as align

from torch import device, empty_strided
from torch._inductor.async_compile import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
from torch._inductor.codegen.multi_kernel import MultiKernelCall

aten = torch.ops.aten
inductor_ops = torch.ops.inductor
_quantized = torch.ops._quantized
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
alloc_from_pool = torch.ops.inductor._alloc_from_pool
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
async_compile = AsyncCompile()

_frozen_param6 = None  # device(type='cpu') torch.bfloat16 (768,) (1,) 7f2e311efab0
_frozen_param12 = None  # device(type='cpu') torch.bfloat16 (768, 768) (1, 0) 7f2e311c2750
_frozen_param8 = None  # device(type='cpu') torch.bfloat16 (768,) (1,) 7f2e311b8770
_frozen_param13 = None  # device(type='cpu') torch.bfloat16 (768, 768) (1, 0) 7f2e311cefc0
_frozen_param10 = None  # device(type='cpu') torch.bfloat16 (768,) (1,) 7f2e311dbbf0
_frozen_param14 = None  # device(type='cpu') torch.bfloat16 (768, 768) (1, 0) 7f2e311ac090

cpp_fused__to_copy_0 = async_compile.cpp_pybinding(['const float*', 'bfloat16*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const float* in_ptr0,
                       bfloat16* out_ptr0)
{
    #pragma omp parallel num_threads(56)
    {
        int tid = omp_get_thread_num();
        {
            #pragma omp for
            for(long x0=static_cast<long>(0L); x0<static_cast<long>(638976L); x0+=static_cast<long>(16L))
            {
                auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<long>(x0), 16);
                auto tmp1 = at::vec::convert<bfloat16>(tmp0);
                tmp1.store(out_ptr0 + static_cast<long>(x0), 16);
            }
        }
    }
}
''')

async_compile.wait(globals())
del async_compile

def call(args):
    arg6_1, = args
    args.clear()
    assert_size_stride(arg6_1, (1, 832, 768), (638976, 768, 1))
    buf0 = empty_strided_cpu((1, 832, 768), (638976, 768, 1), torch.bfloat16)
    cpp_fused__to_copy_0(arg6_1, buf0)
    del arg6_1
    buf1 = torch.ops.mkldnn._linear_pointwise(buf0, _frozen_param12, _frozen_param6, 'none', [-1], '')
    buf2 = torch.ops.mkldnn._linear_pointwise(buf0, _frozen_param13, _frozen_param8, 'none', [-1], '')
    buf3 = torch.ops.mkldnn._linear_pointwise(buf0, _frozen_param14, _frozen_param10, 'none', [-1], '')
    return (reinterpret_tensor(buf1, (1, 12, 832, 64), (638976, 64, 768, 1), 0), reinterpret_tensor(buf2, (1, 12, 832, 64), (638976, 64, 768, 1), 0), reinterpret_tensor(buf3, (1, 12, 832, 64), (638976, 64, 768, 1), 0), )

def benchmark_compiled_module(times=10, repeat=10):
    from torch._dynamo.testing import rand_strided
    from torch._inductor.utils import print_performance
    global _frozen_param6
    _frozen_param6 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.bfloat16)
    global _frozen_param12
    _frozen_param12 = rand_strided((768, 768), (1, 0), device='cpu', dtype=torch.bfloat16)
    global _frozen_param8
    _frozen_param8 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.bfloat16)
    global _frozen_param13
    _frozen_param13 = rand_strided((768, 768), (1, 0), device='cpu', dtype=torch.bfloat16)
    global _frozen_param10
    _frozen_param10 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.bfloat16)
    global _frozen_param14
    _frozen_param14 = rand_strided((768, 768), (1, 0), device='cpu', dtype=torch.bfloat16)
    arg6_1 = rand_strided((1, 832, 768), (638976, 768, 1), device='cpu', dtype=torch.float32)
    fn = lambda: call([arg6_1])
    return print_performance(fn, times=times, repeat=repeat)

if __name__ == "__main__":
    from torch._inductor.wrapper_benchmark import compiled_module_main
    compiled_module_main('hf_BigBird', benchmark_compiled_module)
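The file named in the inductor_output_code record is a self-contained script: its __main__ block wires benchmark_compiled_module into compiled_module_main for standalone timing. A hedged usage sketch (the path below is the one from this machine's log and will differ elsewhere):

import runpy

# Re-runs the generated wrapper's __main__ block, which populates the
# _frozen_param* globals with rand_strided data and times call().
runpy.run_path(
    "/tmp/torchinductor_leslie/rm/crmmdl3pvsdue2ht6qffev3qnvhhdsc4zixorhqtjreztfur5zhi.py",
    run_name="__main__",
)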
V0627 17:31:03.922000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:03.923000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1, "has_payload": "300400f770725170203fcbe28e6ee223"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202274384)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| | | +- GuardManager: source=L['self']._modules, accessed_by=DictGetItemGuardAccessor(_modules)
| | | | +- GuardManager: source=L['self'].key, accessed_by=DictGetItemGuardAccessor(key)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].key, 139839202273568)
| | | | | +- GuardManager: source=L['self'].key.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- GuardManager: source=L['self'].key.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].key.training, 7685824)
| | | | +- GuardManager: source=L['self'].query, accessed_by=DictGetItemGuardAccessor(query)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].query, 139839202273616)
| | | | | +- GuardManager: source=L['self'].query.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- GuardManager: source=L['self'].query.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].query.training, 7685824)
| | | | +- GuardManager: source=L['self'].value, accessed_by=DictGetItemGuardAccessor(value)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].value, 139839202273040)
| | | | | +- GuardManager: source=L['self'].value.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- GuardManager: source=L['self'].value.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].value.training, 7685824)
| | | +- GuardManager: source=L['self'].seed, accessed_by=DictGetItemGuardAccessor(seed)
| | | | +- EQUALS_MATCH: L['self'].seed == 0
| | | +- GuardManager: source=L['self'].block_size, accessed_by=DictGetItemGuardAccessor(block_size)
| | | | +- EQUALS_MATCH: L['self'].block_size == 64
| | | +- GuardManager: source=L['self'].num_random_blocks, accessed_by=DictGetItemGuardAccessor(num_random_blocks)
| | | | +- EQUALS_MATCH: L['self'].num_random_blocks == 3
| | | +- GuardManager: source=L['self'].attention_head_size, accessed_by=DictGetItemGuardAccessor(attention_head_size)
| | | | +- EQUALS_MATCH: L['self'].attention_head_size == 64
| | | +- GuardManager: source=L['self'].num_attention_heads, accessed_by=DictGetItemGuardAccessor(num_attention_heads)
| | | | +- EQUALS_MATCH: L['self'].num_attention_heads == 12
| +- GuardManager: source=L['to_mask'], accessed_by=DictGetItemGuardAccessor(to_mask)
| | +- TENSOR_MATCH: check_tensor(L['to_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 1, 832], stride=[832, 832, 832, 1])
| | +- NO_HASATTR: hasattr(L['to_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['band_mask'], accessed_by=DictGetItemGuardAccessor(band_mask)
| | +- TENSOR_MATCH: check_tensor(L['band_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 9, 64, 192], stride=[110592, 110592, 12288, 192, 1])
| | +- NO_HASATTR: hasattr(L['band_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['from_mask'], accessed_by=DictGetItemGuardAccessor(from_mask)
| | +- TENSOR_MATCH: check_tensor(L['from_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 832, 1], stride=[832, 832, 1, 1])
| | +- NO_HASATTR: hasattr(L['from_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['hidden_states'], accessed_by=DictGetItemGuardAccessor(hidden_states)
| | +- TENSOR_MATCH: check_tensor(L['hidden_states'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 832, 768], stride=[638976, 768, 1])
| | +- NO_HASATTR: hasattr(L['hidden_states'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['from_blocked_mask'], accessed_by=DictGetItemGuardAccessor(from_blocked_mask)
| | +- TENSOR_MATCH: check_tensor(L['from_blocked_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 13, 64], stride=[832, 64, 1])
| | +- NO_HASATTR: hasattr(L['from_blocked_mask'], '_dynamo_dynamic_indices') == False
| | +- TENSOR_ALIASING: L['from_blocked_mask'] is L['to_blocked_mask']
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['output_attentions'], accessed_by=DictGetItemGuardAccessor(output_attentions)
| | +- ID_MATCH: ___check_obj_id(L['output_attentions'], 7685824)
| +- GuardManager: source=L['to_blocked_mask'], accessed_by=DictGetItemGuardAccessor(to_blocked_mask)
| | +- TENSOR_ALIASING: L['from_blocked_mask'] is L['to_blocked_mask']
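A hedged Python paraphrase of what one branch of this guard tree checks at cache-lookup time; the real evaluation happens in the C++ guard manager, but the semantics are the same (the function name is ours):

import torch

def to_mask_guards_hold(to_mask):
    # TENSOR_MATCH + NO_HASATTR for L['to_mask'] in the tree above
    return (
        isinstance(to_mask, torch.Tensor)
        and to_mask.dtype == torch.float32
        and not to_mask.requires_grad
        and tuple(to_mask.shape) == (1, 1, 1, 832)
        and to_mask.stride() == (832, 832, 832, 1)
        and not hasattr(to_mask, "_dynamo_dynamic_indices")
    )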
V0627 17:31:03.923000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "9/0", "frame_key": "14", "co_name": "forward", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 446, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 21, "shape_env_guard_count": 0, "graph_op_count": 9, "graph_node_count": 11, "graph_input_count": 1, "start_time": 1719534663.7326608, "entire_frame_compile_time_s": 0.19047832489013672, "backend_compile_time_s": 0.14537477493286133, "inductor_compile_time_s": 0.0376286506652832, "code_gen_time_s": 0.016646862030029297, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Graph break due to unsupported builtin numpy.random.mtrand.seed. This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind). If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround. If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/docs/main/notes/custom_operators.html for more details) or, if it is traceable, use torch.compiler.allow_in_graph."], "dynamo_time_before_restart_s": 0.018494129180908203, "has_guarded_code": true}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1}
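The restart_reasons entry above records a graph break on numpy.random.seed, a C-extension builtin Dynamo cannot trace. A minimal sketch reproducing that class of break and surfacing it with the explain utility (toy function, not the model code):

import numpy as np
import torch

def f(x):
    np.random.seed(0)   # untraceable C builtin -> graph break, as in the log
    return torch.sin(x)

print(torch._dynamo.explain(f)(torch.randn(4)))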
V0627 17:31:03.925000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 472, "name": "forward", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 10, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.985000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 10, "frame_compile_id": 0, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:03.986000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 10, "frame_compile_id": 0, "attempt": 1, "has_payload": "f6f5661bfad0dc293ecc9ef35ede39a0"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['seed'], accessed_by=DictGetItemGuardAccessor(seed)
| | +- EQUALS_MATCH: L['seed'] == 0
| +- GuardManager: source=L['batch_size'], accessed_by=DictGetItemGuardAccessor(batch_size)
| | +- EQUALS_MATCH: L['batch_size'] == 1
| +- GuardManager: source=L['to_seq_len'], accessed_by=DictGetItemGuardAccessor(to_seq_len)
| | +- EQUALS_MATCH: L['to_seq_len'] == 832
| +- GuardManager: source=L['from_seq_len'], accessed_by=DictGetItemGuardAccessor(from_seq_len)
| | +- EQUALS_MATCH: L['from_seq_len'] == 832
| +- GuardManager: source=L['to_block_size'], accessed_by=DictGetItemGuardAccessor(to_block_size)
| | +- EQUALS_MATCH: L['to_block_size'] == 64
| +- GuardManager: source=L['from_block_size'], accessed_by=DictGetItemGuardAccessor(from_block_size)
| | +- EQUALS_MATCH: L['from_block_size'] == 64
| +- GuardManager: source=L['attention_head_size'], accessed_by=DictGetItemGuardAccessor(attention_head_size)
| | +- EQUALS_MATCH: L['attention_head_size'] == 64
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['np'], accessed_by=DictGetItemGuardAccessor(np)
| | | +- ID_MATCH: ___check_obj_id(G['np'], 139845228893488)
| | | +- GuardManager: source=G['np'].random, accessed_by=GetAttrGuardAccessor(random)
| | | | +- ID_MATCH: ___check_obj_id(G['np'].random, 139842452860464)
| | | | +- GuardManager: source=G['np'].random.seed, accessed_by=GetAttrGuardAccessor(seed)
| | | | | +- ID_MATCH: ___check_obj_id(G['np'].random.seed, 139842451129264)
| | +- GuardManager: source=G['math'], accessed_by=DictGetItemGuardAccessor(math)
| | | +- ID_MATCH: ___check_obj_id(G['math'], 139845236089744)
| | | +- GuardManager: source=G['math'].sqrt, accessed_by=GetAttrGuardAccessor(sqrt)
| | | | +- ID_MATCH: ___check_obj_id(G['math'].sqrt, 139845236093344)
V0627 17:31:03.986000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "10/0", "frame_key": "15", "co_name": "bigbird_block_sparse_attention", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 516, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 17, "shape_env_guard_count": 0, "graph_op_count": 0, "graph_node_count": 0, "graph_input_count": 0, "start_time": 1719534663.9255476, "entire_frame_compile_time_s": 0.06077218055725098, "backend_compile_time_s": null, "inductor_compile_time_s": null, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Graph break due to unsupported builtin numpy.random.mtrand.seed. This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind). If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround. If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/docs/main/notes/custom_operators.html for more details) or, if it is traceable, use torch.compiler.allow_in_graph."], "dynamo_time_before_restart_s": 0.008862972259521484, "has_guarded_code": true}, "frame_id": 10, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.987000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 472, "name": "forward", "filename": 5}, {"line": 569, "name": "bigbird_block_sparse_attention", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 11, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.076000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 11, "frame_compile_id": 0, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:04.077000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 11, "frame_compile_id": 0, "attempt": 1, "has_payload": "f3df28d4d21dab674ac56179543067e7"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202274384)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| +- GuardManager: source=L['n_heads'], accessed_by=DictGetItemGuardAccessor(n_heads)
| | +- EQUALS_MATCH: L['n_heads'] == 12
| +- GuardManager: source=L['to_seq_len'], accessed_by=DictGetItemGuardAccessor(to_seq_len)
| | +- EQUALS_MATCH: L['to_seq_len'] == 832
| +- GuardManager: source=L['from_seq_len'], accessed_by=DictGetItemGuardAccessor(from_seq_len)
| | +- EQUALS_MATCH: L['from_seq_len'] == 832
| +- GuardManager: source=L['n_rand_blocks'], accessed_by=DictGetItemGuardAccessor(n_rand_blocks)
| | +- EQUALS_MATCH: L['n_rand_blocks'] == 3
| +- GuardManager: source=L['to_block_size'], accessed_by=DictGetItemGuardAccessor(to_block_size)
| | +- EQUALS_MATCH: L['to_block_size'] == 64
| +- GuardManager: source=L['from_block_size'], accessed_by=DictGetItemGuardAccessor(from_block_size)
| | +- EQUALS_MATCH: L['from_block_size'] == 64
| +- GuardManager: source=L['plan_from_length'], accessed_by=DictGetItemGuardAccessor(plan_from_length)
| | +- ID_MATCH: ___check_obj_id(L['plan_from_length'], 7636800)
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['__builtins_dict___37'], accessed_by=DictGetItemGuardAccessor(__builtins_dict___37)
| | | +- GuardManager: source=G['__builtins_dict___37']['int'], accessed_by=DictGetItemGuardAccessor(int)
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___37']['int'], 7648640)
V0627 17:31:04.077000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "11/0", "frame_key": "16", "co_name": "torch_dynamo_resume_in_bigbird_block_sparse_attention_at_569", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 569, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 14, "shape_env_guard_count": 0, "graph_op_count": 0, "graph_node_count": 0, "graph_input_count": 0, "start_time": 1719534663.987453, "entire_frame_compile_time_s": 0.08972334861755371, "backend_compile_time_s": null, "inductor_compile_time_s": null, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["data dependent operator: aten._local_scalar_dense.default; to enable, set torch._dynamo.config.capture_scalar_outputs = True"], "dynamo_time_before_restart_s": 0.02129364013671875, "has_guarded_code": true}, "frame_id": 11, "frame_compile_id": 0, "attempt": 1}
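Here the restart reason is instead a data-dependent operator (aten._local_scalar_dense, i.e. Tensor.item()), and the log names the opt-in flag. A hedged sketch of that toggle on a toy function:

import torch

torch._dynamo.config.capture_scalar_outputs = True  # flag named in restart_reasons

@torch.compile
def g(x):
    n = int(x.sum().item())  # _local_scalar_dense: captured instead of breaking
    return x * n

print(g(torch.tensor([2.0, 3.0])))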
V0627 17:31:04.078000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 472, "name": "forward", "filename": 5}, {"line": 569, "name": "bigbird_block_sparse_attention", "filename": 5}, {"line": 583, "name": "torch_dynamo_resume_in_bigbird_block_sparse_attention_at_569", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 12, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.094000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {"wrapped_array": [2], "plan_block_length": [2]}}, "frame_id": 12, "frame_compile_id": 0, "attempt": 1, "has_payload": "9f82d9593d608d32ba61e6298aeb3649"}
class GraphModule(torch.nn.Module):
    def forward(self):
        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1160 in _bigbird_block_rand_mask_with_head, code: plan_block_length = np.array(plan_from_length) // from_block_size
        wrapped_array: "i64[2][1]cpu" = torch__dynamo_utils_wrapped_array([704, 832])
        plan_block_length: "i64[2][1]cpu" = torch__dynamo_utils_wrapped_floordiv(wrapped_array, 64); wrapped_array = None
        return (plan_block_length,)
V0627 17:31:04.106000 139845268738432 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:199] {"aot_forward_graph": {}, "frame_id": 12, "frame_compile_id": 0, "attempt": 1, "has_payload": "1861734e40a5f61860344b326195085c"}
class <lambda>(torch.nn.Module):
    def forward(self):
        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1160 in _bigbird_block_rand_mask_with_head, code: plan_block_length = np.array(plan_from_length) // from_block_size
        _tensor_constant0 = self._tensor_constant0
        lift_fresh_copy: "i64[2][1]cpu" = torch.ops.aten.lift_fresh_copy.default(_tensor_constant0); _tensor_constant0 = None
        clone: "i64[2][1]cpu" = torch.ops.aten.clone.default(lift_fresh_copy); lift_fresh_copy = None
        div: "i64[2][1]cpu" = torch.ops.aten.div.Tensor_mode(clone, 64, rounding_mode = 'floor'); clone = None
        return (div,)
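The graph pair above shows how Dynamo captures a numpy expression: np.array(plan_from_length) // from_block_size becomes lift_fresh_copy/clone plus a floor-division aten op. A sketch checking that the two agree on the traced values:

import numpy as np
import torch

plan_from_length, from_block_size = [704, 832], 64
np_result = np.array(plan_from_length) // from_block_size
torch_result = torch.div(torch.tensor(plan_from_length), from_block_size, rounding_mode="floor")
assert (torch_result.numpy() == np_result).all()  # both give [11, 13]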
V0627 17:31:04.122000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 12, "frame_compile_id": 0, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:04.122000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 12, "frame_compile_id": 0, "attempt": 1, "has_payload": "c1fc05dff62c1bc070ea06a12430d940"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['num_heads'], accessed_by=DictGetItemGuardAccessor(num_heads)
| | +- EQUALS_MATCH: L['num_heads'] == 12
| +- GuardManager: source=L['to_block_size'], accessed_by=DictGetItemGuardAccessor(to_block_size)
| | +- EQUALS_MATCH: L['to_block_size'] == 64
| +- GuardManager: source=L['to_seq_length'], accessed_by=DictGetItemGuardAccessor(to_seq_length)
| | +- EQUALS_MATCH: L['to_seq_length'] == 832
| +- GuardManager: source=L['from_block_size'], accessed_by=DictGetItemGuardAccessor(from_block_size)
| | +- EQUALS_MATCH: L['from_block_size'] == 64
| +- GuardManager: source=L['from_seq_length'], accessed_by=DictGetItemGuardAccessor(from_seq_length)
| | +- EQUALS_MATCH: L['from_seq_length'] == 832
| +- GuardManager: source=L['plan_from_length'], accessed_by=DictGetItemGuardAccessor(plan_from_length)
| | +- TYPE_MATCH: ___check_type_id(L['plan_from_length'], 7650400)
| | +- LENGTH_CHECK: len(L['plan_from_length']) == 2
| | +- GuardManager: source=L['plan_from_length'][0], accessed_by=ListGetItemGuardAccessor(0)
| | | +- EQUALS_MATCH: L['plan_from_length'][0] == 704
| | +- GuardManager: source=L['plan_from_length'][1], accessed_by=ListGetItemGuardAccessor(1)
| | | +- EQUALS_MATCH: L['plan_from_length'][1] == 832
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['np'], accessed_by=DictGetItemGuardAccessor(np)
| | | +- ID_MATCH: ___check_obj_id(G['np'], 139845228893488)
| | | +- GuardManager: source=G['np'].array, accessed_by=GetAttrGuardAccessor(array)
| | | | +- ID_MATCH: ___check_obj_id(G['np'].array, 139845228959664)
| | +- GuardManager: source=G['__builtins_dict___40'], accessed_by=DictGetItemGuardAccessor(__builtins_dict___40)
| | | +- GuardManager: source=G['__builtins_dict___40']['list'], accessed_by=DictGetItemGuardAccessor(list)
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___40']['list'], 7650400)
| | | +- GuardManager: source=G['__builtins_dict___40']['range'], accessed_by=DictGetItemGuardAccessor(range)
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___40']['range'], 7632448)
| | | +- GuardManager: source=G['__builtins_dict___40']['enumerate'], accessed_by=DictGetItemGuardAccessor(enumerate)
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___40']['enumerate'], 7513024)
| | +- GuardManager: source=G['__import_torch_dot__dynamo_dot_polyfill'], accessed_by=DictGetItemGuardAccessor(__import_torch_dot__dynamo_dot_polyfill)
| | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot__dynamo_dot_polyfill'], 139839728158336)
V0627 17:31:04.122000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "12/0", "frame_key": "17", "co_name": "_bigbird_block_rand_mask_with_head", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 1111, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 19, "shape_env_guard_count": 0, "graph_op_count": 2, "graph_node_count": 3, "graph_input_count": 0, "start_time": 1719534664.0783854, "entire_frame_compile_time_s": 0.04439258575439453, "backend_compile_time_s": 0.01859426498413086, "inductor_compile_time_s": 0.00021767616271972656, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["data dependent operator: aten._local_scalar_dense.default; to enable, set torch._dynamo.config.capture_scalar_outputs = True"], "dynamo_time_before_restart_s": 0.00835871696472168, "has_guarded_code": true}, "frame_id": 12, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:04.123000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 472, "name": "forward", "filename": 5}, {"line": 569, "name": "bigbird_block_sparse_attention", "filename": 5}, {"line": 583, "name": "torch_dynamo_resume_in_bigbird_block_sparse_attention_at_569", "filename": 5}, {"line": 1165, "name": "_bigbird_block_rand_mask_with_head", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 13, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.127000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "13/0", "frame_key": "18", "co_name": "<listcomp>", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 1165, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": null, "shape_env_guard_count": null, "graph_op_count": null, "graph_node_count": null, "graph_input_count": null, "start_time": 1719534664.1237168, "entire_frame_compile_time_s": null, "backend_compile_time_s": null, "inductor_compile_time_s": null, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": [], "dynamo_time_before_restart_s": 0.0032224655151367188, "has_guarded_code": false}, "frame_id": 13, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.127000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 472, "name": "forward", "filename": 5}, {"line": 569, "name": "bigbird_block_sparse_attention", "filename": 5}, {"line": 583, "name": "torch_dynamo_resume_in_bigbird_block_sparse_attention_at_569", "filename": 5}, {"line": 1165, "name": "_bigbird_block_rand_mask_with_head", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.131000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.131000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311dafc0>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.131000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 36, "id": 0, "source": "___from_numpy(L['___stack0'][0])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.133000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.133000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311c3510>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.133000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 36, "id": 3, "source": "___from_numpy(L['___stack0'][1])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.135000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.135000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 4, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30db3dd0>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.135000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 36, "id": 4, "source": "___from_numpy(L['___stack0'][2])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.136000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 3, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.136000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 5, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 3, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30db1fd0>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.136000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 36, "id": 5, "source": "___from_numpy(L['___stack0'][3])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.138000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 4, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.138000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 6, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 4, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311daf70>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.138000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 36, "id": 6, "source": "___from_numpy(L['___stack0'][4])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.139000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 5, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.139000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 7, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 5, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311b8ea0>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.140000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 36, "id": 7, "source": "___from_numpy(L['___stack0'][5])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.141000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 6, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.141000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 8, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 6, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311ba200>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.141000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 36, "id": 8, "source": "___from_numpy(L['___stack0'][6])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.142000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 7, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.142000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 9, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 7, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e3119dd50>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.142000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 36, "id": 9, "source": "___from_numpy(L['___stack0'][7])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.144000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 8, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.144000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 10, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 8, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e3119c6d0>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.144000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 36, "id": 10, "source": "___from_numpy(L['___stack0'][8])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.145000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 9, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.146000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 11, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 9, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e3119c4a0>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.146000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 36, "id": 11, "source": "___from_numpy(L['___stack0'][9])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.147000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 10, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.147000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 12, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 10, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30d4d580>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.147000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 36, "id": 12, "source": "___from_numpy(L['___stack0'][10])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.148000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 11, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.149000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 13, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 11, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30d4df30>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.149000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 36, "id": 13, "source": "___from_numpy(L['___stack0'][11])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.153000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {"l_stack0_0_": [13, 3], "l_stack0_1_": [13, 3], "l_stack0_2_": [13, 3], "l_stack0_3_": [13, 3], "l_stack0_4_": [13, 3], "l_stack0_5_": [13, 3], "l_stack0_6_": [13, 3], "l_stack0_7_": [13, 3], "l_stack0_8_": [13, 3], "l_stack0_9_": [13, 3], "l_stack0_10_": [13, 3], "l_stack0_11_": [13, 3], "wrapped_getitem": [11, 3], "wrapped_getitem_1": [11, 3], "wrapped_getitem_2": [11, 3], "wrapped_getitem_3": [11, 3], "wrapped_getitem_4": [11, 3], "wrapped_getitem_5": [11, 3], "wrapped_getitem_6": [11, 3], "wrapped_getitem_7": [11, 3], "wrapped_getitem_8": [11, 3], "wrapped_getitem_9": [11, 3], "wrapped_getitem_10": [11, 3], "wrapped_getitem_11": [11, 3]}}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0, "has_payload": "17b857365a2979b2936469716bb70fcd"}
class GraphModule(torch.nn.Module):
    def forward(self, L_stack0_0_: "i32[13, 3][3, 1]cpu", L_stack0_1_: "i32[13, 3][3, 1]cpu", L_stack0_2_: "i32[13, 3][3, 1]cpu", L_stack0_3_: "i32[13, 3][3, 1]cpu", L_stack0_4_: "i32[13, 3][3, 1]cpu", L_stack0_5_: "i32[13, 3][3, 1]cpu", L_stack0_6_: "i32[13, 3][3, 1]cpu", L_stack0_7_: "i32[13, 3][3, 1]cpu", L_stack0_8_: "i32[13, 3][3, 1]cpu", L_stack0_9_: "i32[13, 3][3, 1]cpu", L_stack0_10_: "i32[13, 3][3, 1]cpu", L_stack0_11_: "i32[13, 3][3, 1]cpu"):
        l_stack0_0_ = L_stack0_0_
        l_stack0_1_ = L_stack0_1_
        l_stack0_2_ = L_stack0_2_
        l_stack0_3_ = L_stack0_3_
        l_stack0_4_ = L_stack0_4_
        l_stack0_5_ = L_stack0_5_
        l_stack0_6_ = L_stack0_6_
        l_stack0_7_ = L_stack0_7_
        l_stack0_8_ = L_stack0_8_
        l_stack0_9_ = L_stack0_9_
        l_stack0_10_ = L_stack0_10_
        l_stack0_11_ = L_stack0_11_
        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1172 in torch_dynamo_resume_in__bigbird_block_rand_mask_with_head_at_1165, code: rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :]
        wrapped_getitem: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem(l_stack0_0_, (slice(1, 12, None), slice(None, None, None))); l_stack0_0_ = None
        wrapped_getitem_1: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_1(l_stack0_1_, (slice(1, 12, None), slice(None, None, None))); l_stack0_1_ = None
        wrapped_getitem_2: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_2(l_stack0_2_, (slice(1, 12, None), slice(None, None, None))); l_stack0_2_ = None
        wrapped_getitem_3: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_3(l_stack0_3_, (slice(1, 12, None), slice(None, None, None))); l_stack0_3_ = None
        wrapped_getitem_4: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_4(l_stack0_4_, (slice(1, 12, None), slice(None, None, None))); l_stack0_4_ = None
        wrapped_getitem_5: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_5(l_stack0_5_, (slice(1, 12, None), slice(None, None, None))); l_stack0_5_ = None
        wrapped_getitem_6: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_6(l_stack0_6_, (slice(1, 12, None), slice(None, None, None))); l_stack0_6_ = None
        wrapped_getitem_7: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_7(l_stack0_7_, (slice(1, 12, None), slice(None, None, None))); l_stack0_7_ = None
        wrapped_getitem_8: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_8(l_stack0_8_, (slice(1, 12, None), slice(None, None, None))); l_stack0_8_ = None
        wrapped_getitem_9: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_9(l_stack0_9_, (slice(1, 12, None), slice(None, None, None))); l_stack0_9_ = None
        wrapped_getitem_10: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_10(l_stack0_10_, (slice(1, 12, None), slice(None, None, None))); l_stack0_10_ = None
        wrapped_getitem_11: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_11(l_stack0_11_, (slice(1, 12, None), slice(None, None, None))); l_stack0_11_ = None
        return (wrapped_getitem, wrapped_getitem_1, wrapped_getitem_2, wrapped_getitem_3, wrapped_getitem_4, wrapped_getitem_5, wrapped_getitem_6, wrapped_getitem_7, wrapped_getitem_8, wrapped_getitem_9, wrapped_getitem_10, wrapped_getitem_11)
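This graph is the per-head trimming loop from modeling_big_bird.py:1172 unrolled twelve times: each (13, 3) rand_attn entry loses its first and last block rows. An eager-mode equivalent, as a sketch using the constant values visible in the guards further down (num_blocks=13, global_block_top=global_block_bottom=1; the zero arrays are placeholders):

import numpy as np

num_blocks, global_block_top, global_block_bottom = 13, 1, 1
rand_attn = [np.zeros((num_blocks, 3), dtype=np.int32) for _ in range(12)]
for nh in range(12):
    # same slice the traced wrapped_getitem calls perform
    rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :]
assert all(a.shape == (11, 3) for a in rand_attn)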
V0627 17:31:04.205000 139845268738432 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:199] {"aot_forward_graph": {}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0, "has_payload": "949e5e36955b193a43ad76da2f5a5f52"}
class <lambda>(torch.nn.Module):
    def forward(self, arg0_1: "i32[13, 3][3, 1]cpu", arg1_1: "i32[13, 3][3, 1]cpu", arg2_1: "i32[13, 3][3, 1]cpu", arg3_1: "i32[13, 3][3, 1]cpu", arg4_1: "i32[13, 3][3, 1]cpu", arg5_1: "i32[13, 3][3, 1]cpu", arg6_1: "i32[13, 3][3, 1]cpu", arg7_1: "i32[13, 3][3, 1]cpu", arg8_1: "i32[13, 3][3, 1]cpu", arg9_1: "i32[13, 3][3, 1]cpu", arg10_1: "i32[13, 3][3, 1]cpu", arg11_1: "i32[13, 3][3, 1]cpu"):
        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1172 in torch_dynamo_resume_in__bigbird_block_rand_mask_with_head_at_1165, code: rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :]
        slice_1: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg0_1, 0, 1, 12); arg0_1 = None
        slice_2: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_1, 1, 0, 9223372036854775807); slice_1 = None
        slice_3: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg1_1, 0, 1, 12); arg1_1 = None
        slice_4: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_3, 1, 0, 9223372036854775807); slice_3 = None
        slice_5: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg2_1, 0, 1, 12); arg2_1 = None
        slice_6: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_5, 1, 0, 9223372036854775807); slice_5 = None
        slice_7: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg3_1, 0, 1, 12); arg3_1 = None
        slice_8: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_7, 1, 0, 9223372036854775807); slice_7 = None
        slice_9: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg4_1, 0, 1, 12); arg4_1 = None
        slice_10: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_9, 1, 0, 9223372036854775807); slice_9 = None
        slice_11: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg5_1, 0, 1, 12); arg5_1 = None
        slice_12: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_11, 1, 0, 9223372036854775807); slice_11 = None
        slice_13: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg6_1, 0, 1, 12); arg6_1 = None
        slice_14: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_13, 1, 0, 9223372036854775807); slice_13 = None
        slice_15: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg7_1, 0, 1, 12); arg7_1 = None
        slice_16: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_15, 1, 0, 9223372036854775807); slice_15 = None
        slice_17: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg8_1, 0, 1, 12); arg8_1 = None
        slice_18: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_17, 1, 0, 9223372036854775807); slice_17 = None
        slice_19: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg9_1, 0, 1, 12); arg9_1 = None
        slice_20: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_19, 1, 0, 9223372036854775807); slice_19 = None
        slice_21: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg10_1, 0, 1, 12); arg10_1 = None
        slice_22: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_21, 1, 0, 9223372036854775807); slice_21 = None
        slice_23: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg11_1, 0, 1, 12); arg11_1 = None
        slice_24: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_23, 1, 0, 9223372036854775807); slice_23 = None
        return (slice_2, slice_4, slice_6, slice_8, slice_10, slice_12, slice_14, slice_16, slice_18, slice_20, slice_22, slice_24)
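In the AOT graph each Python `[1:12, :]` subscript decomposes into two aten.slice.Tensor calls, one per dimension; end=9223372036854775807 (2**63 - 1) is the conventional "slice to the end" sentinel, so the dim-1 call is a no-op view. A quick sketch of the correspondence:

import torch

x = torch.zeros(13, 3, dtype=torch.int32)
a = torch.ops.aten.slice.Tensor(x, 0, 1, 12)                   # rows 1..11
b = torch.ops.aten.slice.Tensor(a, 1, 0, 9223372036854775807)  # all columns
assert torch.equal(b, x[1:12, :])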
V0627 17:31:04.236000 139845268738432 torch/_inductor/compile_fx.py:754] {"inductor_post_grad_graph": {}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0, "has_payload": "9c01c298863b324227937cc74dd4d962"}
class <lambda>(torch.nn.Module):
    def forward(self, arg0_1: "i32[13, 3][3, 1]cpu", arg1_1: "i32[13, 3][3, 1]cpu", arg2_1: "i32[13, 3][3, 1]cpu", arg3_1: "i32[13, 3][3, 1]cpu", arg4_1: "i32[13, 3][3, 1]cpu", arg5_1: "i32[13, 3][3, 1]cpu", arg6_1: "i32[13, 3][3, 1]cpu", arg7_1: "i32[13, 3][3, 1]cpu", arg8_1: "i32[13, 3][3, 1]cpu", arg9_1: "i32[13, 3][3, 1]cpu", arg10_1: "i32[13, 3][3, 1]cpu", arg11_1: "i32[13, 3][3, 1]cpu"):
        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1172 in torch_dynamo_resume_in__bigbird_block_rand_mask_with_head_at_1165, code: rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :]
        slice_1: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg0_1, 0, 1, 12); arg0_1 = None
        slice_3: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg1_1, 0, 1, 12); arg1_1 = None
        slice_5: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg2_1, 0, 1, 12); arg2_1 = None
        slice_7: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg3_1, 0, 1, 12); arg3_1 = None
        slice_9: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg4_1, 0, 1, 12); arg4_1 = None
        slice_11: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg5_1, 0, 1, 12); arg5_1 = None
        slice_13: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg6_1, 0, 1, 12); arg6_1 = None
        slice_15: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg7_1, 0, 1, 12); arg7_1 = None
        slice_17: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg8_1, 0, 1, 12); arg8_1 = None
        slice_19: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg9_1, 0, 1, 12); arg9_1 = None
        slice_21: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg10_1, 0, 1, 12); arg10_1 = None
        slice_23: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg11_1, 0, 1, 12); arg11_1 = None
        return (slice_1, slice_3, slice_5, slice_7, slice_9, slice_11, slice_13, slice_15, slice_17, slice_19, slice_21, slice_23)
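The post-grad graph has dropped the even-numbered dim-1 slices: a full-range slice returns a view with identical metadata, so slice_2, slice_4, ... were redundant. A sketch of why that rewrite is safe:

import torch

x = torch.zeros(11, 3, dtype=torch.int32)
y = torch.ops.aten.slice.Tensor(x, 1, 0, 9223372036854775807)  # full-range slice
assert y.shape == x.shape and y.stride() == x.stride()
assert y.data_ptr() == x.data_ptr()  # same storage, no copy: a pure no-op view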
V0627 17:31:04.250000 139845268738432 torch/_inductor/graph.py:1693] {"inductor_output_code": {"filename": "/tmp/torchinductor_leslie/ki/ckiwvz5uks7efnnd5ew4d76276bcutmxkkfh6ozi5dgumqv3m2qc.py"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0, "has_payload": "6641116284eedbc64e23effbbbfe40e6"}
# AOT ID: ['7_inference']
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
from torch._inductor.codegen.memory_planning import _align as align
from torch import device, empty_strided
from torch._inductor.async_compile import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
from torch._inductor.codegen.multi_kernel import MultiKernelCall
aten = torch.ops.aten
inductor_ops = torch.ops.inductor
_quantized = torch.ops._quantized
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
alloc_from_pool = torch.ops.inductor._alloc_from_pool
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
async_compile = AsyncCompile()
async_compile.wait(globals())
del async_compile
def call(args):
    arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1 = args
    args.clear()
    assert_size_stride(arg0_1, (13, 3), (3, 1))
    assert_size_stride(arg1_1, (13, 3), (3, 1))
    assert_size_stride(arg2_1, (13, 3), (3, 1))
    assert_size_stride(arg3_1, (13, 3), (3, 1))
    assert_size_stride(arg4_1, (13, 3), (3, 1))
    assert_size_stride(arg5_1, (13, 3), (3, 1))
    assert_size_stride(arg6_1, (13, 3), (3, 1))
    assert_size_stride(arg7_1, (13, 3), (3, 1))
    assert_size_stride(arg8_1, (13, 3), (3, 1))
    assert_size_stride(arg9_1, (13, 3), (3, 1))
    assert_size_stride(arg10_1, (13, 3), (3, 1))
    assert_size_stride(arg11_1, (13, 3), (3, 1))
    return (reinterpret_tensor(arg0_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg1_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg2_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg3_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg4_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg5_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg6_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg7_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg8_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg9_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg10_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg11_1, (11, 3), (3, 1), 3), )
def benchmark_compiled_module(times=10, repeat=10):
    from torch._dynamo.testing import rand_strided
    from torch._inductor.utils import print_performance
    arg0_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg1_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg2_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg3_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg4_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg5_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg6_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg7_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg8_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg9_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg10_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg11_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1])
    return print_performance(fn, times=times, repeat=repeat)
if __name__ == "__main__":
    from torch._inductor.wrapper_benchmark import compiled_module_main
    compiled_module_main('hf_BigBird', benchmark_compiled_module)
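The generated call() contains no kernels at all: every output is a zero-copy reinterpretation of its input with storage offset 3, i.e. it skips one (3,)-row of int32 and views the remaining 11 rows, which is exactly arg[1:12]. A sketch of the same view built with the public as_strided API (reinterpret_tensor itself is an Inductor-internal helper):

import torch

arg = torch.arange(39, dtype=torch.int32).reshape(13, 3)
view = arg.as_strided((11, 3), (3, 1), storage_offset=3)
assert torch.equal(view, arg[1:12])
assert view.data_ptr() == arg.data_ptr() + 3 * arg.element_size()  # no copy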
V0627 17:31:04.258000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:04.259000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0, "has_payload": "bb9e1aaf4decc7f300fbb51ff6f34967"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202274384)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| +- GuardManager: source=L['___stack0'], accessed_by=DictGetItemGuardAccessor(___stack0)
| | +- TYPE_MATCH: ___check_type_id(L['___stack0'], 7650400)
| | +- LENGTH_CHECK: len(L['___stack0']) == 12
| | +- GuardManager: source=L['___stack0'][0], accessed_by=ListGetItemGuardAccessor(0)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][0]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][0]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][0]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][1], accessed_by=ListGetItemGuardAccessor(1)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][1]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][1]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][1]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][2], accessed_by=ListGetItemGuardAccessor(2)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][2]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][2]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][2]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][3], accessed_by=ListGetItemGuardAccessor(3)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][3]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][3]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][3]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][4], accessed_by=ListGetItemGuardAccessor(4)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][4]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][4]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][4]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][5], accessed_by=ListGetItemGuardAccessor(5)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][5]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][5]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][5]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][6], accessed_by=ListGetItemGuardAccessor(6)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][6]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][6]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][6]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][7], accessed_by=ListGetItemGuardAccessor(7)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][7]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][7]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][7]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][8], accessed_by=ListGetItemGuardAccessor(8)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][8]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][8]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][8]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][9], accessed_by=ListGetItemGuardAccessor(9)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][9]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][9]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][9]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][10], accessed_by=ListGetItemGuardAccessor(10)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][10]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][10]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][10]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][11], accessed_by=ListGetItemGuardAccessor(11)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][11]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][11]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][11]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| +- GuardManager: source=L['num_heads'], accessed_by=DictGetItemGuardAccessor(num_heads)
| | +- EQUALS_MATCH: L['num_heads'] == 12
| +- GuardManager: source=L['num_blocks'], accessed_by=DictGetItemGuardAccessor(num_blocks)
| | +- EQUALS_MATCH: L['num_blocks'] == 13
| +- GuardManager: source=L['global_block_top'], accessed_by=DictGetItemGuardAccessor(global_block_top)
| | +- EQUALS_MATCH: L['global_block_top'] == 1
| +- GuardManager: source=L['global_block_bottom'], accessed_by=DictGetItemGuardAccessor(global_block_bottom)
| | +- EQUALS_MATCH: L['global_block_bottom'] == 1
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['__builtins_dict___44'], accessed_by=DictGetItemGuardAccessor(__builtins_dict___44)
| | | +- GuardManager: source=G['__builtins_dict___44']['range'], accessed_by=DictGetItemGuardAccessor(range)
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___44']['range'], 7632448)
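Before reusing this compiled artifact, Dynamo evaluates the guard tree above against the live frame. A rough Python rendering of what it checks, as an illustration only (the real guards run in C++, and NO_TENSOR_ALIASING does a proper storage-overlap check rather than the simplified data_ptr comparison used here):

import torch

def guards_ok(stack0, num_heads, num_blocks, global_block_top, global_block_bottom):
    # TYPE_MATCH / LENGTH_CHECK on L['___stack0']
    if not (isinstance(stack0, list) and len(stack0) == 12):
        return False
    tensors = [torch.from_numpy(a) for a in stack0]  # ___from_numpy(...)
    for t in tensors:
        # TENSOR_MATCH: dtype, shape, stride, requires_grad
        if (t.dtype != torch.int32 or tuple(t.shape) != (13, 3)
                or t.stride() != (3, 1) or t.requires_grad):
            return False
    # NO_TENSOR_ALIASING (simplified): all twelve arrays are distinct buffers
    if len({t.data_ptr() for t in tensors}) != len(tensors):
        return False
    # EQUALS_MATCH on the scalar locals
    return (num_heads, num_blocks, global_block_top, global_block_bottom) == (12, 13, 1, 1)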
V0627 17:31:04.259000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "14/0", "frame_key": "19", "co_name": "torch_dynamo_resume_in__bigbird_block_rand_mask_with_head_at_1165", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 1165, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 25, "shape_env_guard_count": 0, "graph_op_count": 12, "graph_node_count": 25, "graph_input_count": 12, "start_time": 1719534664.1278515, "entire_frame_compile_time_s": 0.13145732879638672, "backend_compile_time_s": 0.09916210174560547, "inductor_compile_time_s": 0.022524356842041016, "code_gen_time_s": 0.003596067428588867, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": [], "dynamo_time_before_restart_s": 0.0, "has_guarded_code": true}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
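For this compilation_metrics record the timing fields nest as one would expect: codegen time is part of the Inductor compile time, which is part of the backend compile time, which is part of the whole-frame compile time. A sanity-check sketch over the values logged above:

metrics = {"entire_frame_compile_time_s": 0.13145732879638672,
           "backend_compile_time_s": 0.09916210174560547,
           "inductor_compile_time_s": 0.022524356842041016,
           "code_gen_time_s": 0.003596067428588867}
assert metrics["code_gen_time_s"] <= metrics["inductor_compile_time_s"]
assert metrics["inductor_compile_time_s"] <= metrics["backend_compile_time_s"]
assert metrics["backend_compile_time_s"] <= metrics["entire_frame_compile_time_s"]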
V0627 17:31:04.260000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 472, "name": "forward", "filename": 5}, {"line": 569, "name": "bigbird_block_sparse_attention", "filename": 5}, {"line": 583, "name": "torch_dynamo_resume_in_bigbird_block_sparse_attention_at_569", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.267000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.267000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e310b7060>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.267000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 0, "source": "___from_numpy(L['___stack0'][0])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.268000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.268000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311ad1c0>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.268000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 1, "source": "___from_numpy(L['___stack0'][1])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.269000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.269000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 2, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e3119c810>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.269000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 2, "source": "___from_numpy(L['___stack0'][2])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.269000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 3, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.270000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 3, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e3119ef70>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.270000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 3, "source": "___from_numpy(L['___stack0'][3])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.270000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 4, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.270000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 4, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 4, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311c58a0>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.270000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 4, "source": "___from_numpy(L['___stack0'][4])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.271000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 5, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.271000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 5, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 5, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30db1d00>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.271000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 5, "source": "___from_numpy(L['___stack0'][5])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.272000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 6, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.272000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 6, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 6, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e310b9f30>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.272000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 6, "source": "___from_numpy(L['___stack0'][6])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.273000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 7, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.273000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 7, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 7, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311ac310>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.273000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 7, "source": "___from_numpy(L['___stack0'][7])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.274000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 8, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.274000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 8, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 8, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e310bb100>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.274000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 8, "source": "___from_numpy(L['___stack0'][8])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.274000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 9, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.275000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 9, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 9, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30d4d580>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.275000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 9, "source": "___from_numpy(L['___stack0'][9])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.275000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 10, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.275000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 10, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 10, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30d4c450>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.275000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 10, "source": "___from_numpy(L['___stack0'][10])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.276000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 11, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.276000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 11, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 11, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e310f5e90>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.276000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 11, "source": "___from_numpy(L['___stack0'][11])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.279000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 12, "describer_id": 38, "size": 1277952}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.279000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 13, "ndim": 4, "dtype": "torch.bfloat16", "device": "device(type='cpu')", "size": [1, 12, 832, 64], "is_leaf": true, "stride": [638976, 64, 768, 1], "storage": 12, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311c6840>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.279000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 13, "source": "L['query_layer']"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.283000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 13, "describer_id": 38, "size": 3328}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.283000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 17, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 13, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.284000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 16, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 13, 64], "is_leaf": true, "is_view": true, "stride": [832, 64, 1], "storage": 13, "base": 17, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317fbec0>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.284000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 16, "source": "L['from_blocked_mask']"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.293000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 14, "describer_id": 38, "size": 1277952}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.294000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 31, "ndim": 4, "dtype": "torch.bfloat16", "device": "device(type='cpu')", "size": [1, 12, 832, 64], "is_leaf": true, "stride": [638976, 64, 768, 1], "storage": 14, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311c73d0>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.294000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 31, "source": "L['key_layer']"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.295000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 15, "describer_id": 38, "size": 1277952}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.295000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 32, "ndim": 4, "dtype": "torch.bfloat16", "device": "device(type='cpu')", "size": [1, 12, 832, 64], "is_leaf": true, "stride": [638976, 64, 768, 1], "storage": 15, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311c7470>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.295000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 32, "source": "L['value_layer']"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.316000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 50, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 13, "base": 17, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.317000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 50, "source": "L['to_mask']"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.378000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 16, "describer_id": 38, "size": 442368}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.378000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 124, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 16, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.378000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 124, "source": "L['band_mask']"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.439000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 182, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 13, "base": 17, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.439000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 182, "source": "L['from_mask']"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
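Frame 15 resumes after the numpy block: its inputs are the twelve trimmed (11, 3) arrays from frame 14 plus the bfloat16 query/key/value layers and the float32 masks. The storage byte counts in the describe_storage records above are consistent with the recorded shapes and dtypes, e.g.:

# Sanity-check sketch for the describe_storage sizes above
assert 11 * 3 * 4 == 132                    # (11, 3) int32 rand_attn entries
assert 1 * 12 * 832 * 64 * 2 == 1277952     # (1, 12, 832, 64) bfloat16 q/k/v
assert 1 * 1 * 9 * 64 * 192 * 4 == 442368   # (1, 1, 9, 64, 192) float32 band_mask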
V0627 17:31:04.456000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {"l_stack0_0_": [11, 3], "l_stack0_1_": [11, 3], "l_stack0_2_": [11, 3], "l_stack0_3_": [11, 3], "l_stack0_4_": [11, 3], "l_stack0_5_": [11, 3], "l_stack0_6_": [11, 3], "l_stack0_7_": [11, 3], "l_stack0_8_": [11, 3], "l_stack0_9_": [11, 3], "l_stack0_10_": [11, 3], "l_stack0_11_": [11, 3], "l_query_layer_": [1, 12, 832, 64], "l_from_blocked_mask_": [1, 13, 64], "l_key_layer_": [1, 12, 832, 64], "l_value_layer_": [1, 12, 832, 64], "l_to_mask_": [1, 1, 1, 832], "l_band_mask_": [1, 1, 9, 64, 192], "l_from_mask_": [1, 1, 832, 1], "rand_attn": [12, 11, 3], "rand_attn_1": [1, 12, 11, 3], "unsqueeze_": [1, 12, 11, 3], "rand_attn_2": [1, 12, 11, 3], "p1": [13, 64], "i1": [12, 11, 3], "flatten": [396], "getitem_2": [396, 64], "rand_mask": [1, 396, 64], "rand_mask_1": [1, 12, 11, 192], "getitem_3": [1, 11, 64], "rand_mask_2": [1, 12, 11, 64, 192], "blocked_query_matrix": [1, 12, 13, 64, 64], "blocked_key_matrix": [1, 12, 13, 64, 64], "blocked_value_matrix": [1, 12, 13, 64, 64], "shift": [396], "div": [396], "indices_shift": [396], "view_4": [396], "flattened_indices": [396], "flattened_params": [156, 64, 64], "out_flattened": [396, 64, 64], "out": [1, 12, 33, 64, 64], "gathered_key": [1, 12, 11, 192, 64], "shift_1": [396], "div_1": [396], "indices_shift_1": [396], "view_6": [396], "flattened_indices_1": [396], "flattened_params_1": [156, 64, 64], "out_flattened_1": [396, 64, 64], "out_1": [1, 12, 33, 64, 64], "gathered_value": [1, 12, 11, 192, 64], "getitem_4": [1, 12, 64, 64], "reshape_4": [12, 64, 64], "reshape_5": [12, 832, 64], "transpose": [12, 64, 832], "bmm": [12, 64, 832], "first_product": [1, 12, 64, 832], "first_product_1": [1, 12, 64, 832], "sub": [1, 1, 1, 832], "mul_3": [1, 1, 1, 832], "first_product_2": [1, 12, 64, 832], "first_attn_weights": [1, 12, 64, 832], "reshape_6": [12, 64, 832], "reshape_7": [12, 832, 64], "bmm_1": [12, 64, 64], "first_context_layer": [1, 12, 1, 64, 64], "unsqueeze__1": [1, 12, 1, 64, 64], "getitem_5": [1, 12, 64, 64], "getitem_6": [1, 12, 64, 64], "getitem_7": [1, 12, 64, 64], "getitem_8": [1, 12, 64, 64], "getitem_9": [1, 12, 192, 64], "second_key_mat": [1, 12, 448, 64], "getitem_10": [1, 12, 64, 64], "getitem_11": [1, 12, 64, 64], "getitem_12": [1, 12, 64, 64], "getitem_13": [1, 12, 64, 64], "getitem_14": [1, 12, 192, 64], "second_value_mat": [1, 12, 448, 64], "getitem_15": [1, 12, 64, 64], "reshape_8": [12, 64, 64], "reshape_9": [12, 448, 64], "transpose_1": [12, 64, 448], "bmm_2": [12, 64, 448], "second_product": [1, 12, 64, 448], "getitem_16": [1, 1, 1, 192], "getitem_17": [1, 1, 1, 64], "new_ones": [1, 1, 1, 192], "second_seq_pad": [1, 1, 1, 448], "new_ones_1": [1, 12, 64, 256], "getitem_18": [1, 12, 64, 192], "second_rand_pad": [1, 12, 64, 448], "second_product_1": [1, 12, 64, 448], "minimum": [1, 12, 64, 448], "sub_1": [1, 12, 64, 448], "mul_5": [1, 12, 64, 448], "second_product_2": [1, 12, 64, 448], "second_attn_weights": [1, 12, 64, 448], "reshape_10": [12, 64, 448], "reshape_11": [12, 448, 64], "bmm_3": [12, 64, 64], "second_context_layer": [1, 12, 1, 64, 64], "unsqueeze__2": [1, 12, 1, 64, 64], "getitem_19": [1, 12, 9, 64, 64], "getitem_20": [1, 12, 9, 64, 64], "getitem_21": [1, 12, 9, 64, 64], "exp_blocked_key_matrix": [1, 12, 9, 192, 64], "getitem_22": [1, 12, 9, 64, 64], "getitem_23": [1, 12, 9, 64, 64], "getitem_24": [1, 12, 9, 64, 64], "exp_blocked_value_matrix": [1, 12, 9, 192, 64], "middle_query_matrix": [1, 12, 9, 64, 64], "reshape_12": [108, 64, 64], "reshape_13": [108, 192, 64], "transpose_2": [108, 64, 192], "bmm_4": [108, 64, 192], "inner_band_product": [1, 12, 9, 64, 192], "inner_band_product_1": [1, 12, 9, 64, 192], "getitem_26": [1, 12, 9, 192, 64], "reshape_14": [108, 64, 64], "reshape_15": [108, 192, 64], "transpose_3": [108, 64, 192], "bmm_5": [108, 64, 192], "rand_band_product": [1, 12, 9, 64, 192], "rand_band_product_1": [1, 12, 9, 64, 192], "getitem_27": [1, 12, 64, 64], "first_band_product": [1, 12, 9, 64, 64], "first_band_product_1": [1, 12, 9, 64, 64], "getitem_28": [1, 12, 64, 64], "last_band_product": [1, 12, 9, 64, 64], "last_band_product_1": [1, 12, 9, 64, 64], "sub_2": [1, 1, 9, 64, 192], "mul_10": [1, 1, 9, 64, 192], "inner_band_product_2": [1, 12, 9, 64, 192], "getitem_29": [1, 1, 1, 64], "unsqueeze": [1, 1, 1, 1, 64], "sub_3": [1, 1, 1, 1, 64], "mul_11": [1, 1, 1, 1, 64], "first_band_product_2": [1, 12, 9, 64, 64], "getitem_30": [1, 1, 1, 64], "unsqueeze_1": [1, 1, 1, 1, 64], "sub_4": [1, 1, 1, 1, 64], "mul_12": [1, 1, 1, 1, 64], "last_band_product_2": [1, 12, 9, 64, 64], "getitem_31": [1, 12, 9, 64, 192], "sub_5": [1, 12, 9, 64, 192], "mul_13": [1, 12, 9, 64, 192], "rand_band_product_2": [1, 12, 9, 64, 192], "band_product": [1, 12, 9, 64, 512], "attn_weights": [1, 12, 9, 64, 512], "getitem_32": [1, 12, 9, 64, 192], "reshape_16": [108, 64, 192], "reshape_17": [108, 192, 64], "bmm_6": [108, 64, 64], "context_layer": [1, 12, 9, 64, 64], "getitem_33": [1, 12, 9, 64, 192], "getitem_34": [1, 12, 9, 192, 64], "reshape_18": [108, 64, 192], "reshape_19": [108, 192, 64], "bmm_7": [108, 64, 64], "view_15": [1, 12, 9, 64, 64], "context_layer_1": [1, 12, 9, 64, 64], "getitem_35": [1, 12, 9, 64, 64], "getitem_36": [1, 12, 64, 64], "einsum_3": [1, 12, 9, 64, 64], "context_layer_2": [1, 12, 9, 64, 64], "getitem_37": [1, 12, 9, 64, 64], "getitem_38": [1, 12, 64, 64], "einsum_4": [1, 12, 9, 64, 64], "context_layer_3": [1, 12, 9, 64, 64], "getitem_39": [1, 12, 64, 64], "getitem_40": [1, 12, 64, 64], "getitem_41": [1, 12, 64, 64], "getitem_42": [1, 12, 64, 64], "getitem_43": [1, 12, 192, 64], "second_last_key_mat": [1, 12, 448, 64], "getitem_44": [1, 12, 64, 64], "getitem_45": [1, 12, 64, 64], "getitem_46": [1, 12, 64, 64], "getitem_47": [1, 12, 64, 64], "getitem_48": [1, 12, 192, 64], "second_last_value_mat": [1, 12, 448, 64], "getitem_49": [1, 12, 64, 64], "reshape_20": [12, 64, 64], "reshape_21": [12, 448, 64], "transpose_4": [12, 64, 448], "bmm_8": [12, 64, 448], "second_last_product": [1, 12, 64, 448], "getitem_50": [1, 1, 1, 64], "getitem_51": [1, 1, 1, 192], "new_ones_2": [1, 1, 1, 192], "second_last_seq_pad": [1, 1, 1, 448], "new_ones_3": [1, 12, 64, 256], "getitem_52": [1, 12, 64, 192], "second_last_rand_pad": [1, 12, 64, 448], "second_last_product_1": [1, 12, 64, 448], "minimum_1": [1, 12, 64, 448], "sub_6": [1, 12, 64, 448], "mul_15": [1, 12, 64, 448], "second_last_product_2": [1, 12, 64, 448], "second_last_attn_weights": [1, 12, 64, 448], "reshape_22": [12, 64, 448], "reshape_23": [12, 448, 64], "bmm_9": [12, 64, 64], "second_last_context_layer": [1, 12, 1, 64, 64], "unsqueeze__3": [1, 12, 1, 64, 64], "getitem_53": [1, 12, 64, 64], "reshape_24": [12, 64, 64], "reshape_25": [12, 832, 64], "transpose_5": [12, 64, 832], "bmm_10": [12, 64, 832], "last_product": [1, 12, 64, 832], "last_product_1": [1, 12, 64, 832], "sub_7": [1, 1, 1, 832], "mul_17": [1, 1, 1, 832], "last_product_2": [1, 12, 64, 832], "last_attn_weights": [1, 12, 64, 832], "reshape_26": [12, 64, 832], "reshape_27": [12, 832, 64], "bmm_11": [12, 64, 64], "last_context_layer": [1, 12, 1, 64, 64], "unsqueeze__4": [1, 12, 1, 64, 64], "context_layer_4": [1, 12, 13, 64, 64], "view_20": [1, 12, 832, 64], "context_layer_5": [1, 12, 832, 64], "context_layer_6": [1, 832, 12, 64]}}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0, "has_payload": "8c75f78cbe780240c3647a51d604b94a"}
class GraphModule(torch.nn.Module):
def forward(self, L_stack0_0_: "i32[11, 3][3, 1]cpu", L_stack0_1_: "i32[11, 3][3, 1]cpu", L_stack0_2_: "i32[11, 3][3, 1]cpu", L_stack0_3_: "i32[11, 3][3, 1]cpu", L_stack0_4_: "i32[11, 3][3, 1]cpu", L_stack0_5_: "i32[11, 3][3, 1]cpu", L_stack0_6_: "i32[11, 3][3, 1]cpu", L_stack0_7_: "i32[11, 3][3, 1]cpu", L_stack0_8_: "i32[11, 3][3, 1]cpu", L_stack0_9_: "i32[11, 3][3, 1]cpu", L_stack0_10_: "i32[11, 3][3, 1]cpu", L_stack0_11_: "i32[11, 3][3, 1]cpu", L_query_layer_: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", L_from_blocked_mask_: "f32[1, 13, 64][832, 64, 1]cpu", L_key_layer_: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", L_value_layer_: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", L_to_mask_: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu", L_band_mask_: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu", L_from_mask_: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu"):
l_stack0_0_ = L_stack0_0_
l_stack0_1_ = L_stack0_1_
l_stack0_2_ = L_stack0_2_
l_stack0_3_ = L_stack0_3_
l_stack0_4_ = L_stack0_4_
l_stack0_5_ = L_stack0_5_
l_stack0_6_ = L_stack0_6_
l_stack0_7_ = L_stack0_7_
l_stack0_8_ = L_stack0_8_
l_stack0_9_ = L_stack0_9_
l_stack0_10_ = L_stack0_10_
l_stack0_11_ = L_stack0_11_
l_query_layer_ = L_query_layer_
l_from_blocked_mask_ = L_from_blocked_mask_
l_key_layer_ = L_key_layer_
l_value_layer_ = L_value_layer_
l_to_mask_ = L_to_mask_
l_band_mask_ = L_band_mask_
l_from_mask_ = L_from_mask_
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:593 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = np.stack(rand_attn, axis=0)
rand_attn: "i32[12, 11, 3][33, 3, 1]cpu" = torch__dynamo_utils_wrapped_stack([l_stack0_0_, l_stack0_1_, l_stack0_2_, l_stack0_3_, l_stack0_4_, l_stack0_5_, l_stack0_6_, l_stack0_7_, l_stack0_8_, l_stack0_9_, l_stack0_10_, l_stack0_11_], axis = 0); l_stack0_0_ = l_stack0_1_ = l_stack0_2_ = l_stack0_3_ = l_stack0_4_ = l_stack0_5_ = l_stack0_6_ = l_stack0_7_ = l_stack0_8_ = l_stack0_9_ = l_stack0_10_ = l_stack0_11_ = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:594 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = torch.tensor(rand_attn, device=query_layer.device, dtype=torch.long)
rand_attn_1: "i64[1, 12, 11, 3][396, 33, 3, 1]cpu" = torch.tensor(rand_attn, device = device(type='cpu'), dtype = torch.int64); rand_attn = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:595 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn.unsqueeze_(0)
unsqueeze_: "i64[1, 12, 11, 3][396, 33, 3, 1]cpu" = rand_attn_1.unsqueeze_(0)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:596 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = torch.cat([rand_attn for _ in range(batch_size)], dim=0)
rand_attn_2: "i64[1, 12, 11, 3][396, 33, 3, 1]cpu" = torch.cat([rand_attn_1], dim = 0); rand_attn_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in _create_rand_mask_from_inputs, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
p1: "f32[13, 64][64, 1]cpu" = l_from_blocked_mask_[0]
i1: "i64[12, 11, 3][33, 3, 1]cpu" = rand_attn_2[0]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in <listcomp>, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
flatten: "i64[396][1]cpu" = i1.flatten(); i1 = None
getitem_2: "f32[396, 64][64, 1]cpu" = p1[flatten]; p1 = flatten = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in _create_rand_mask_from_inputs, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
rand_mask: "f32[1, 396, 64][25344, 64, 1]cpu" = torch.stack([getitem_2]); getitem_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1016 in _create_rand_mask_from_inputs, code: rand_mask = rand_mask.view(batch_size, num_attention_heads, num_windows, num_rand_blocks * from_block_size)
rand_mask_1: "f32[1, 12, 11, 192][25344, 2112, 192, 1]cpu" = rand_mask.view(1, 12, 11, 192); rand_mask = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1017 in _create_rand_mask_from_inputs, code: rand_mask = torch.einsum("blq,bhlk->bhlqk", from_blocked_mask[:, 1:-1], rand_mask)
getitem_3: "f32[1, 11, 64][832, 64, 1]cpu" = l_from_blocked_mask_[(slice(None, None, None), slice(1, -1, None))]; l_from_blocked_mask_ = None
rand_mask_2: "f32[1, 12, 11, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.functional.einsum('blq,bhlk->bhlqk', getitem_3, rand_mask_1); getitem_3 = rand_mask_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:602 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_query_matrix = query_layer.view(bsz, n_heads, from_seq_len // from_block_size, from_block_size, -1)
blocked_query_matrix: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = l_query_layer_.view(1, 12, 13, 64, -1); l_query_layer_ = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:603 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix = key_layer.view(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1)
blocked_key_matrix: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = l_key_layer_.view(1, 12, 13, 64, -1)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:604 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix = value_layer.view(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1)
blocked_value_matrix: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = l_value_layer_.view(1, 12, 13, 64, -1)
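# query/key/value are reshaped into 13 blocks of 64 positions each
# (13 * 64 = 832); these views share storage with the original layers.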
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:972 in torch_gather_b2, code: shift = torch.arange(indices.shape[0] * indices.shape[1] * num_indices_to_gather, device=indices.device)
shift: "i64[396][1]cpu" = torch.arange(396, device = device(type='cpu'))
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:973 in torch_gather_b2, code: indices_shift = torch.div(shift, num_indices_to_gather, rounding_mode="floor") * num_indices_to_pick_from
div: "i64[396][1]cpu" = torch.div(shift, 33, rounding_mode = 'floor'); shift = None
indices_shift: "i64[396][1]cpu" = div * 13; div = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:975 in torch_gather_b2, code: flattened_indices = indices.view(-1) + indices_shift
view_4: "i64[396][1]cpu" = rand_attn_2.view(-1)
flattened_indices: "i64[396][1]cpu" = view_4 + indices_shift; view_4 = indices_shift = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:976 in torch_gather_b2, code: flattened_params = params.reshape(-1, params.shape[-2], params.shape[-1])
flattened_params: "bf16[156, 64, 64][4096, 64, 1]cpu" = blocked_key_matrix.reshape(-1, 64, 64)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:978 in torch_gather_b2, code: out_flattened = flattened_params.index_select(0, flattened_indices)
out_flattened: "bf16[396, 64, 64][4096, 64, 1]cpu" = flattened_params.index_select(0, flattened_indices); flattened_params = flattened_indices = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:980 in torch_gather_b2, code: out = out_flattened.reshape(params.shape[:2] + (num_indices_to_gather,) + params.shape[3:])
out: "bf16[1, 12, 33, 64, 64][1622016, 135168, 4096, 64, 1]cpu" = out_flattened.reshape((1, 12, 33, 64, 64)); out_flattened = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:608 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key = gathered_key.view(
gathered_key: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = out.view(1, 12, 11, 192, -1); out = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:972 in torch_gather_b2, code: shift = torch.arange(indices.shape[0] * indices.shape[1] * num_indices_to_gather, device=indices.device)
shift_1: "i64[396][1]cpu" = torch.arange(396, device = device(type='cpu'))
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:973 in torch_gather_b2, code: indices_shift = torch.div(shift, num_indices_to_gather, rounding_mode="floor") * num_indices_to_pick_from
div_1: "i64[396][1]cpu" = torch.div(shift_1, 33, rounding_mode = 'floor'); shift_1 = None
indices_shift_1: "i64[396][1]cpu" = div_1 * 13; div_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:975 in torch_gather_b2, code: flattened_indices = indices.view(-1) + indices_shift
view_6: "i64[396][1]cpu" = rand_attn_2.view(-1)
flattened_indices_1: "i64[396][1]cpu" = view_6 + indices_shift_1; view_6 = indices_shift_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:976 in torch_gather_b2, code: flattened_params = params.reshape(-1, params.shape[-2], params.shape[-1])
flattened_params_1: "bf16[156, 64, 64][4096, 64, 1]cpu" = blocked_value_matrix.reshape(-1, 64, 64)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:978 in torch_gather_b2, code: out_flattened = flattened_params.index_select(0, flattened_indices)
out_flattened_1: "bf16[396, 64, 64][4096, 64, 1]cpu" = flattened_params_1.index_select(0, flattened_indices_1); flattened_params_1 = flattened_indices_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:980 in torch_gather_b2, code: out = out_flattened.reshape(params.shape[:2] + (num_indices_to_gather,) + params.shape[3:])
out_1: "bf16[1, 12, 33, 64, 64][1622016, 135168, 4096, 64, 1]cpu" = out_flattened_1.reshape((1, 12, 33, 64, 64)); out_flattened_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:612 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value = gathered_value.view(
gathered_value: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = out_1.view(1, 12, 11, 192, -1); out_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:621 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, 0], key_layer, ndim=4)
getitem_4: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_query_matrix[(slice(None, None, None), slice(None, None, None), 0)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
reshape_4: "bf16[12, 64, 64][64, 768, 1]cpu" = getitem_4.reshape((-1, 64, 64)); getitem_4 = None
reshape_5: "bf16[12, 832, 64][64, 768, 1]cpu" = l_key_layer_.reshape((-1, 832, 64))
transpose: "bf16[12, 64, 832][64, 1, 768]cpu" = reshape_5.transpose(1, 2); reshape_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
bmm: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.bmm(reshape_4, transpose); reshape_4 = transpose = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
first_product: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = bmm.view((1, 12, 64, 832)); bmm = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:623 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product = first_product * rsqrt_d
first_product_1: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = first_product * 0.125; first_product = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:624 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product += (1.0 - to_mask) * attn_mask_penalty
sub: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = 1.0 - l_to_mask_
mul_3: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = sub * -10000.0; sub = None
first_product_1 += mul_3; first_product_2: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = first_product_1; first_product_1 = mul_3 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:625 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_attn_weights = nn.functional.softmax(
first_attn_weights: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.nn.functional.softmax(first_product_2, dim = -1); first_product_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
reshape_6: "bf16[12, 64, 832][53248, 832, 1]cpu" = first_attn_weights.reshape((-1, 64, 832)); first_attn_weights = None
reshape_7: "bf16[12, 832, 64][64, 768, 1]cpu" = l_value_layer_.reshape((-1, 832, 64))
bmm_1: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.bmm(reshape_6, reshape_7); reshape_6 = reshape_7 = None
first_context_layer: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = bmm_1.view((1, 12, 64, 64)); bmm_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:631 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_context_layer.unsqueeze_(2)
unsqueeze__1: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = first_context_layer.unsqueeze_(2)
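# First query block (global attention): it attends to all 832 keys, scaled by
# rsqrt_d = 0.125 = 1/sqrt(head_dim=64); masked-out keys receive the -10000.0
# attn_mask_penalty before softmax. unsqueeze_(2) restores the block dim,
# giving [1, 12, 1, 64, 64].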
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:641 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 0],
getitem_5: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), 0)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:642 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 1],
getitem_6: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), 1)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:643 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 2],
getitem_7: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), 2)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:644 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -1],
getitem_8: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), -1)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:645 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key[:, :, 0],
getitem_9: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = gathered_key[(slice(None, None, None), slice(None, None, None), 0)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:639 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_key_mat = torch.cat(
second_key_mat: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" = torch.cat([getitem_5, getitem_6, getitem_7, getitem_8, getitem_9], dim = 2); getitem_5 = getitem_6 = getitem_7 = getitem_8 = getitem_9 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:651 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 0],
getitem_10: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), 0)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:652 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 1],
getitem_11: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), 1)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:653 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 2],
getitem_12: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), 2)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:654 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -1],
getitem_13: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), -1)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:655 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value[:, :, 0],
getitem_14: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = gathered_value[(slice(None, None, None), slice(None, None, None), 0)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:649 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_value_mat = torch.cat(
second_value_mat: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" = torch.cat([getitem_10, getitem_11, getitem_12, getitem_13, getitem_14], dim = 2); getitem_10 = getitem_11 = getitem_12 = getitem_13 = getitem_14 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:661 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, 1], second_key_mat, ndim=4)
getitem_15: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_query_matrix[(slice(None, None, None), slice(None, None, None), 1)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
reshape_8: "bf16[12, 64, 64][64, 768, 1]cpu" = getitem_15.reshape((-1, 64, 64)); getitem_15 = None
reshape_9: "bf16[12, 448, 64][28672, 64, 1]cpu" = second_key_mat.reshape((-1, 448, 64)); second_key_mat = None
transpose_1: "bf16[12, 64, 448][28672, 1, 64]cpu" = reshape_9.transpose(1, 2); reshape_9 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
bmm_2: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.bmm(reshape_8, transpose_1); reshape_8 = transpose_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
second_product: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = bmm_2.view((1, 12, 64, 448)); bmm_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:664 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, : 3 * to_block_size],
getitem_16: "f32[1, 1, 1, 192][832, 832, 832, 1]cpu" = l_to_mask_[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(None, 192, None))]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:665 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, -to_block_size:],
getitem_17: "f32[1, 1, 1, 64][832, 832, 832, 1]cpu" = l_to_mask_[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(-64, None, None))]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:666 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask.new_ones([bsz, 1, 1, n_rand_blocks * to_block_size]),
new_ones: "f32[1, 1, 1, 192][192, 192, 192, 1]cpu" = l_to_mask_.new_ones([1, 1, 1, 192])
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:662 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_seq_pad = torch.cat(
second_seq_pad: "f32[1, 1, 1, 448][448, 448, 448, 1]cpu" = torch.cat([getitem_16, getitem_17, new_ones], dim = 3); getitem_16 = getitem_17 = new_ones = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:672 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask.new_ones([bsz, n_heads, from_block_size, 4 * to_block_size]),
new_ones_1: "f32[1, 12, 64, 256][196608, 16384, 256, 1]cpu" = rand_mask_2.new_ones([1, 12, 64, 256])
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:673 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask[:, :, 0],
getitem_18: "f32[1, 12, 64, 192][1622016, 135168, 192, 1]cpu" = rand_mask_2[(slice(None, None, None), slice(None, None, None), 0)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:670 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_rand_pad = torch.cat(
second_rand_pad: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.cat([new_ones_1, getitem_18], dim = 3); new_ones_1 = getitem_18 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:677 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product = second_product * rsqrt_d
second_product_1: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = second_product * 0.125; second_product = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:678 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product += (1.0 - torch.minimum(second_seq_pad, second_rand_pad)) * attn_mask_penalty
minimum: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.minimum(second_seq_pad, second_rand_pad); second_seq_pad = second_rand_pad = None
sub_1: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = 1.0 - minimum; minimum = None
mul_5: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = sub_1 * -10000.0; sub_1 = None
second_product_1 += mul_5; second_product_2: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = second_product_1; second_product_1 = mul_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:679 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_attn_weights = nn.functional.softmax(
second_attn_weights: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.nn.functional.softmax(second_product_2, dim = -1); second_product_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
reshape_10: "bf16[12, 64, 448][28672, 448, 1]cpu" = second_attn_weights.reshape((-1, 64, 448)); second_attn_weights = None
reshape_11: "bf16[12, 448, 64][28672, 64, 1]cpu" = second_value_mat.reshape((-1, 448, 64)); second_value_mat = None
bmm_3: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.bmm(reshape_10, reshape_11); reshape_10 = reshape_11 = None
second_context_layer: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = bmm_3.view((1, 12, 64, 64)); bmm_3 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:686 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_context_layer.unsqueeze_(2)
unsqueeze__2: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = second_context_layer.unsqueeze_(2)
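# Second query block: its 448 = 7 * 64 keys are key blocks 0, 1, 2, the last
# block, and the 3 gathered random blocks for row 0; torch.minimum combines the
# sequence pad (from to_mask) with the random-block pad before the penalty.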
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:696 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: [blocked_key_matrix[:, :, 1:-3], blocked_key_matrix[:, :, 2:-2], blocked_key_matrix[:, :, 3:-1]], dim=3
getitem_19: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), slice(1, -3, None))]
getitem_20: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), slice(2, -2, None))]
getitem_21: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), slice(3, -1, None))]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:695 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: exp_blocked_key_matrix = torch.cat(
exp_blocked_key_matrix: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" = torch.cat([getitem_19, getitem_20, getitem_21], dim = 3); getitem_19 = getitem_20 = getitem_21 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:699 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: [blocked_value_matrix[:, :, 1:-3], blocked_value_matrix[:, :, 2:-2], blocked_value_matrix[:, :, 3:-1]],
getitem_22: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), slice(1, -3, None))]
getitem_23: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), slice(2, -2, None))]
getitem_24: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), slice(3, -1, None))]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:698 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: exp_blocked_value_matrix = torch.cat(
exp_blocked_value_matrix: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" = torch.cat([getitem_22, getitem_23, getitem_24], dim = 3); getitem_22 = getitem_23 = getitem_24 = None
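# Sliding-window keys/values for the 9 innermost query blocks (2..10): each
# block sees its left, center, and right neighbor blocks (3 * 64 = 192 keys).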
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:702 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: middle_query_matrix = blocked_query_matrix[:, :, 2:-2]
middle_query_matrix: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = blocked_query_matrix[(slice(None, None, None), slice(None, None, None), slice(2, -2, None))]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
reshape_12: "bf16[108, 64, 64][4096, 64, 1]cpu" = middle_query_matrix.reshape((-1, 64, 64))
reshape_13: "bf16[108, 192, 64][12288, 64, 1]cpu" = exp_blocked_key_matrix.reshape((-1, 192, 64)); exp_blocked_key_matrix = None
transpose_2: "bf16[108, 64, 192][12288, 1, 64]cpu" = reshape_13.transpose(1, 2); reshape_13 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
bmm_4: "bf16[108, 64, 192][12288, 192, 1]cpu" = torch.bmm(reshape_12, transpose_2); reshape_12 = transpose_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
inner_band_product: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = bmm_4.view((1, 12, 9, 64, 192)); bmm_4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:708 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: inner_band_product = inner_band_product * rsqrt_d
inner_band_product_1: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = inner_band_product * 0.125; inner_band_product = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:712 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product = self.torch_bmm_nd_transpose(middle_query_matrix, gathered_key[:, :, 1:-1], ndim=5)
getitem_26: "bf16[1, 12, 9, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = gathered_key[(slice(None, None, None), slice(None, None, None), slice(1, -1, None))]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
reshape_14: "bf16[108, 64, 64][4096, 64, 1]cpu" = middle_query_matrix.reshape((-1, 64, 64))
reshape_15: "bf16[108, 192, 64][12288, 64, 1]cpu" = getitem_26.reshape((-1, 192, 64)); getitem_26 = None
transpose_3: "bf16[108, 64, 192][12288, 1, 64]cpu" = reshape_15.transpose(1, 2); reshape_15 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
bmm_5: "bf16[108, 64, 192][12288, 192, 1]cpu" = torch.bmm(reshape_14, transpose_3); reshape_14 = transpose_3 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
rand_band_product: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = bmm_5.view((1, 12, 9, 64, 192)); bmm_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:714 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product = rand_band_product * rsqrt_d
rand_band_product_1: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = rand_band_product * 0.125; rand_band_product = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:718 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqd,bhkd->bhlqk", middle_query_matrix, blocked_key_matrix[:, :, 0]
getitem_27: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), 0)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:717 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product = torch.einsum(
first_band_product: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.functional.einsum('bhlqd,bhkd->bhlqk', middle_query_matrix, getitem_27); getitem_27 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:720 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product = first_band_product * rsqrt_d
first_band_product_1: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = first_band_product * 0.125; first_band_product = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:724 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqd,bhkd->bhlqk", middle_query_matrix, blocked_key_matrix[:, :, -1]
getitem_28: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), -1)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:723 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product = torch.einsum(
last_band_product: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.functional.einsum('bhlqd,bhkd->bhlqk', middle_query_matrix, getitem_28); middle_query_matrix = getitem_28 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:726 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product = last_band_product * rsqrt_d
last_band_product_1: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = last_band_product * 0.125; last_band_product = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:729 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: inner_band_product += (1.0 - band_mask) * attn_mask_penalty
sub_2: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = 1.0 - l_band_mask_; l_band_mask_ = None
mul_10: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = sub_2 * -10000.0; sub_2 = None
inner_band_product_1 += mul_10; inner_band_product_2: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = inner_band_product_1; inner_band_product_1 = mul_10 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:730 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product += (1.0 - to_mask[:, :, :, :to_block_size].unsqueeze(3)) * attn_mask_penalty
getitem_29: "f32[1, 1, 1, 64][832, 832, 832, 1]cpu" = l_to_mask_[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(None, 64, None))]
unsqueeze: "f32[1, 1, 1, 1, 64][832, 832, 832, 64, 1]cpu" = getitem_29.unsqueeze(3); getitem_29 = None
sub_3: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = 1.0 - unsqueeze; unsqueeze = None
mul_11: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = sub_3 * -10000.0; sub_3 = None
first_band_product_1 += mul_11; first_band_product_2: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = first_band_product_1; first_band_product_1 = mul_11 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:731 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product += (1.0 - to_mask[:, :, :, -to_block_size:].unsqueeze(3)) * attn_mask_penalty
getitem_30: "f32[1, 1, 1, 64][832, 832, 832, 1]cpu" = l_to_mask_[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(-64, None, None))]
unsqueeze_1: "f32[1, 1, 1, 1, 64][832, 832, 832, 64, 1]cpu" = getitem_30.unsqueeze(3); getitem_30 = None
sub_4: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = 1.0 - unsqueeze_1; unsqueeze_1 = None
mul_12: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = sub_4 * -10000.0; sub_4 = None
last_band_product_1 += mul_12; last_band_product_2: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = last_band_product_1; last_band_product_1 = mul_12 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:732 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product += (1.0 - rand_mask[:, :, 1:-1]) * attn_mask_penalty
getitem_31: "f32[1, 12, 9, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = rand_mask_2[(slice(None, None, None), slice(None, None, None), slice(1, -1, None))]
sub_5: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = 1.0 - getitem_31; getitem_31 = None
mul_13: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = sub_5 * -10000.0; sub_5 = None
rand_band_product_1 += mul_13; rand_band_product_2: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = rand_band_product_1; rand_band_product_1 = mul_13 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:735 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: band_product = torch.cat(
band_product: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.cat([first_band_product_2, inner_band_product_2, rand_band_product_2, last_band_product_2], dim = -1); first_band_product_2 = inner_band_product_2 = rand_band_product_2 = last_band_product_2 = None
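# Per middle block, the 512 attention logits concatenate
# [first global | band | random | last global] = 64 + 192 + 192 + 64,
# each part already scaled by rsqrt_d and mask-penalized above.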
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:740 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights = nn.functional.softmax(
attn_weights: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.nn.functional.softmax(band_product, dim = -1); band_product = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:747 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights[:, :, :, :, to_block_size : 4 * to_block_size], exp_blocked_value_matrix, ndim=5
getitem_32: "bf16[1, 12, 9, 64, 192][3538944, 294912, 32768, 512, 1]cpu" = attn_weights[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(64, 256, None))]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
reshape_16: "bf16[108, 64, 192][32768, 512, 1]cpu" = getitem_32.reshape((-1, 64, 192)); getitem_32 = None
reshape_17: "bf16[108, 192, 64][12288, 64, 1]cpu" = exp_blocked_value_matrix.reshape((-1, 192, 64)); exp_blocked_value_matrix = None
bmm_6: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.bmm(reshape_16, reshape_17); reshape_16 = reshape_17 = None
context_layer: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = bmm_6.view((1, 12, 9, 64, 64)); bmm_6 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:754 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights[:, :, :, :, 4 * to_block_size : -to_block_size], gathered_value[:, :, 1:-1], ndim=5
getitem_33: "bf16[1, 12, 9, 64, 192][3538944, 294912, 32768, 512, 1]cpu" = attn_weights[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(256, -64, None))]
getitem_34: "bf16[1, 12, 9, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = gathered_value[(slice(None, None, None), slice(None, None, None), slice(1, -1, None))]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
reshape_18: "bf16[108, 64, 192][32768, 512, 1]cpu" = getitem_33.reshape((-1, 64, 192)); getitem_33 = None
reshape_19: "bf16[108, 192, 64][12288, 64, 1]cpu" = getitem_34.reshape((-1, 192, 64)); getitem_34 = None
bmm_7: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.bmm(reshape_18, reshape_19); reshape_18 = reshape_19 = None
view_15: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = bmm_7.view((1, 12, 9, 64, 64)); bmm_7 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:753 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += self.torch_bmm_nd(
context_layer += view_15; context_layer_1: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = context_layer; context_layer = view_15 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:760 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, :to_block_size], blocked_value_matrix[:, :, 0]
getitem_35: "bf16[1, 12, 9, 64, 64][3538944, 294912, 32768, 512, 1]cpu" = attn_weights[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(None, 64, None))]
getitem_36: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), 0)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:759 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += torch.einsum(
einsum_3: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.functional.einsum('bhlqk,bhkd->bhlqd', getitem_35, getitem_36); getitem_35 = getitem_36 = None
context_layer_1 += einsum_3; context_layer_2: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = context_layer_1; context_layer_1 = einsum_3 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:763 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, -to_block_size:], blocked_value_matrix[:, :, -1]
getitem_37: "bf16[1, 12, 9, 64, 64][3538944, 294912, 32768, 512, 1]cpu" = attn_weights[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(-64, None, None))]; attn_weights = None
getitem_38: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), -1)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:762 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += torch.einsum(
einsum_4: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.functional.einsum('bhlqk,bhkd->bhlqd', getitem_37, getitem_38); getitem_37 = getitem_38 = None
context_layer_2 += einsum_4; context_layer_3: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = context_layer_2; context_layer_2 = einsum_4 = None
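# Middle context = softmax-weighted sum over the same four key groups: the band
# and random slices of attn_weights via bmm, the first/last global slices via
# einsum, accumulated in place with +=.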
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:775 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 0],
getitem_39: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), 0)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:776 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -3],
getitem_40: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), -3)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:777 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -2],
getitem_41: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), -2)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:778 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -1],
getitem_42: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), -1)]; blocked_key_matrix = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:779 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key[:, :, -1],
getitem_43: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = gathered_key[(slice(None, None, None), slice(None, None, None), -1)]; gathered_key = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:773 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_key_mat = torch.cat(
second_last_key_mat: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" = torch.cat([getitem_39, getitem_40, getitem_41, getitem_42, getitem_43], dim = 2); getitem_39 = getitem_40 = getitem_41 = getitem_42 = getitem_43 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:785 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 0],
getitem_44: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), 0)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:786 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -3],
getitem_45: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), -3)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:787 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -2],
getitem_46: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), -2)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:788 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -1],
getitem_47: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), -1)]; blocked_value_matrix = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:789 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value[:, :, -1],
getitem_48: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = gathered_value[(slice(None, None, None), slice(None, None, None), -1)]; gathered_value = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:783 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_value_mat = torch.cat(
second_last_value_mat: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" = torch.cat([getitem_44, getitem_45, getitem_46, getitem_47, getitem_48], dim = 2); getitem_44 = getitem_45 = getitem_46 = getitem_47 = getitem_48 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:795 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -2], second_last_key_mat, ndim=4)
getitem_49: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_query_matrix[(slice(None, None, None), slice(None, None, None), -2)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
reshape_20: "bf16[12, 64, 64][64, 768, 1]cpu" = getitem_49.reshape((-1, 64, 64)); getitem_49 = None
reshape_21: "bf16[12, 448, 64][28672, 64, 1]cpu" = second_last_key_mat.reshape((-1, 448, 64)); second_last_key_mat = None
transpose_4: "bf16[12, 64, 448][28672, 1, 64]cpu" = reshape_21.transpose(1, 2); reshape_21 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
bmm_8: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.bmm(reshape_20, transpose_4); reshape_20 = transpose_4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
second_last_product: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = bmm_8.view((1, 12, 64, 448)); bmm_8 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:798 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, :to_block_size],
getitem_50: "f32[1, 1, 1, 64][832, 832, 832, 1]cpu" = l_to_mask_[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(None, 64, None))]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:799 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, -3 * to_block_size :],
getitem_51: "f32[1, 1, 1, 192][832, 832, 832, 1]cpu" = l_to_mask_[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(-192, None, None))]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:800 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask.new_ones([bsz, 1, 1, n_rand_blocks * to_block_size]),
new_ones_2: "f32[1, 1, 1, 192][192, 192, 192, 1]cpu" = l_to_mask_.new_ones([1, 1, 1, 192])
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:796 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_seq_pad = torch.cat(
second_last_seq_pad: "f32[1, 1, 1, 448][448, 448, 448, 1]cpu" = torch.cat([getitem_50, getitem_51, new_ones_2], dim = 3); getitem_50 = getitem_51 = new_ones_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:806 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask.new_ones([bsz, n_heads, from_block_size, 4 * to_block_size]),
new_ones_3: "f32[1, 12, 64, 256][196608, 16384, 256, 1]cpu" = rand_mask_2.new_ones([1, 12, 64, 256])
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:807 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask[:, :, -1],
getitem_52: "f32[1, 12, 64, 192][1622016, 135168, 192, 1]cpu" = rand_mask_2[(slice(None, None, None), slice(None, None, None), -1)]; rand_mask_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:804 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_rand_pad = torch.cat(
second_last_rand_pad: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.cat([new_ones_3, getitem_52], dim = 3); new_ones_3 = getitem_52 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:811 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product = second_last_product * rsqrt_d
second_last_product_1: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = second_last_product * 0.125; second_last_product = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:812 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product += (1.0 - torch.minimum(second_last_seq_pad, second_last_rand_pad)) * attn_mask_penalty
minimum_1: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.minimum(second_last_seq_pad, second_last_rand_pad); second_last_seq_pad = second_last_rand_pad = None
sub_6: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = 1.0 - minimum_1; minimum_1 = None
mul_15: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = sub_6 * -10000.0; sub_6 = None
second_last_product_1 += mul_15; second_last_product_2: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = second_last_product_1; second_last_product_1 = mul_15 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:813 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_attn_weights = nn.functional.softmax(
second_last_attn_weights: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.nn.functional.softmax(second_last_product_2, dim = -1); second_last_product_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
reshape_22: "bf16[12, 64, 448][28672, 448, 1]cpu" = second_last_attn_weights.reshape((-1, 64, 448)); second_last_attn_weights = None
reshape_23: "bf16[12, 448, 64][28672, 64, 1]cpu" = second_last_value_mat.reshape((-1, 448, 64)); second_last_value_mat = None
bmm_9: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.bmm(reshape_22, reshape_23); reshape_22 = reshape_23 = None
second_last_context_layer: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = bmm_9.view((1, 12, 64, 64)); bmm_9 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:819 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_context_layer.unsqueeze_(2)
unsqueeze__3: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = second_last_context_layer.unsqueeze_(2)
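# Second-to-last query block mirrors the second one: key blocks 0, -3, -2, -1
# plus the last row of gathered random blocks (again 448 = 7 * 64 keys).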
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:826 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -1], key_layer, ndim=4)
getitem_53: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_query_matrix[(slice(None, None, None), slice(None, None, None), -1)]; blocked_query_matrix = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
reshape_24: "bf16[12, 64, 64][64, 768, 1]cpu" = getitem_53.reshape((-1, 64, 64)); getitem_53 = None
reshape_25: "bf16[12, 832, 64][64, 768, 1]cpu" = l_key_layer_.reshape((-1, 832, 64)); l_key_layer_ = None
transpose_5: "bf16[12, 64, 832][64, 1, 768]cpu" = reshape_25.transpose(1, 2); reshape_25 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
bmm_10: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.bmm(reshape_24, transpose_5); reshape_24 = transpose_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
last_product: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = bmm_10.view((1, 12, 64, 832)); bmm_10 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:827 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product = last_product * rsqrt_d
last_product_1: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = last_product * 0.125; last_product = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:828 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product += (1.0 - to_mask) * attn_mask_penalty
sub_7: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = 1.0 - l_to_mask_; l_to_mask_ = None
mul_17: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = sub_7 * -10000.0; sub_7 = None
last_product_1 += mul_17; last_product_2: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = last_product_1; last_product_1 = mul_17 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:829 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_attn_weights = nn.functional.softmax(last_product, dim=-1) # [bsz, n_heads, from_block_size, n]
last_attn_weights: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.nn.functional.softmax(last_product_2, dim = -1); last_product_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
reshape_26: "bf16[12, 64, 832][53248, 832, 1]cpu" = last_attn_weights.reshape((-1, 64, 832)); last_attn_weights = None
reshape_27: "bf16[12, 832, 64][64, 768, 1]cpu" = l_value_layer_.reshape((-1, 832, 64)); l_value_layer_ = None
bmm_11: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.bmm(reshape_26, reshape_27); reshape_26 = reshape_27 = None
last_context_layer: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = bmm_11.view((1, 12, 64, 64)); bmm_11 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:833 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_context_layer.unsqueeze_(2)
unsqueeze__4: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = last_context_layer.unsqueeze_(2)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:836 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer = torch.cat(
context_layer_4: "bf16[1, 12, 13, 64, 64][638976, 53248, 4096, 64, 1]cpu" = torch.cat([first_context_layer, second_context_layer, context_layer_3, second_last_context_layer, last_context_layer], dim = 2); first_context_layer = second_context_layer = context_layer_3 = second_last_context_layer = last_context_layer = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:840 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer = context_layer.view((bsz, n_heads, from_seq_len, -1)) * from_mask
view_20: "bf16[1, 12, 832, 64][638976, 53248, 64, 1]cpu" = context_layer_4.view((1, 12, 832, -1)); context_layer_4 = None
context_layer_5: "f32[1, 12, 832, 64][638976, 53248, 64, 1]cpu" = view_20 * l_from_mask_; view_20 = l_from_mask_ = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:841 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer = torch.transpose(context_layer, 1, 2)
context_layer_6: "f32[1, 832, 12, 64][638976, 64, 53248, 1]cpu" = torch.transpose(context_layer_5, 1, 2); context_layer_5 = None
return (context_layer_6, rand_attn_2)
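The graph above repeatedly inlines torch_bmm_nd / torch_bmm_nd_transpose from modeling_big_bird.py; as a reference for reading those reshape/bmm/view triples, here is a minimal standalone sketch of the same flattening pattern (shapes picked to match this trace; the helper name and example tensors are illustrative, not part of the log):

import torch

def bmm_nd(inp_1, inp_2, ndim):
    # Collapse every leading dim into the single bmm batch dim,
    # multiply the trailing 2-D matrices, then restore the leading dims.
    out = torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]),
                    inp_2.reshape((-1,) + inp_2.shape[-2:]))
    return out.view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 1]))

attn = torch.randn(1, 12, 64, 448, dtype=torch.bfloat16)   # e.g. second_last_attn_weights
value = torch.randn(1, 12, 448, 64, dtype=torch.bfloat16)  # e.g. second_last_value_mat
assert bmm_nd(attn, value, ndim=4).shape == (1, 12, 64, 64)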
V0627 17:31:05.175000 139845268738432 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:199] {"aot_forward_graph": {}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0, "has_payload": "9d92a7e58f208e3c617d3e5fb4f3ee25"}
class <lambda>(torch.nn.Module):
def forward(self, arg0_1: "i32[11, 3][3, 1]cpu", arg1_1: "i32[11, 3][3, 1]cpu", arg2_1: "i32[11, 3][3, 1]cpu", arg3_1: "i32[11, 3][3, 1]cpu", arg4_1: "i32[11, 3][3, 1]cpu", arg5_1: "i32[11, 3][3, 1]cpu", arg6_1: "i32[11, 3][3, 1]cpu", arg7_1: "i32[11, 3][3, 1]cpu", arg8_1: "i32[11, 3][3, 1]cpu", arg9_1: "i32[11, 3][3, 1]cpu", arg10_1: "i32[11, 3][3, 1]cpu", arg11_1: "i32[11, 3][3, 1]cpu", arg12_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", arg13_1: "f32[1, 13, 64][832, 64, 1]cpu", arg14_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", arg15_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", arg16_1: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu", arg17_1: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu", arg18_1: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu"):
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:593 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = np.stack(rand_attn, axis=0)
cat: "i32[132, 3][3, 1]cpu" = torch.ops.aten.cat.default([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1]); arg0_1 = arg1_1 = arg2_1 = arg3_1 = arg4_1 = arg5_1 = arg6_1 = arg7_1 = arg8_1 = arg9_1 = arg10_1 = arg11_1 = None
view: "i32[12, 11, 3][33, 3, 1]cpu" = torch.ops.aten.view.default(cat, [12, 11, 3]); cat = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:594 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = torch.tensor(rand_attn, device=query_layer.device, dtype=torch.long)
alias: "i32[12, 11, 3][33, 3, 1]cpu" = torch.ops.aten.alias.default(view); view = None
alias_1: "i32[12, 11, 3][33, 3, 1]cpu" = torch.ops.aten.alias.default(alias); alias = None
alias_2: "i32[12, 11, 3][33, 3, 1]cpu" = torch.ops.aten.alias.default(alias_1); alias_1 = None
convert_element_type: "i64[12, 11, 3][33, 3, 1]cpu" = torch.ops.prims.convert_element_type.default(alias_2, torch.int64); alias_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:595 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn.unsqueeze_(0)
unsqueeze: "i64[1, 12, 11, 3][396, 33, 3, 1]cpu" = torch.ops.aten.unsqueeze.default(convert_element_type, 0); convert_element_type = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:596 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = torch.cat([rand_attn for _ in range(batch_size)], dim=0)
clone: "i64[1, 12, 11, 3][396, 33, 3, 1]cpu" = torch.ops.aten.clone.default(unsqueeze); unsqueeze = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in _create_rand_mask_from_inputs, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
select: "f32[13, 64][64, 1]cpu" = torch.ops.aten.select.int(arg13_1, 0, 0)
select_1: "i64[12, 11, 3][33, 3, 1]cpu" = torch.ops.aten.select.int(clone, 0, 0)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in <listcomp>, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
view_1: "i64[396][1]cpu" = torch.ops.aten.view.default(select_1, [396]); select_1 = None
index: "f32[396, 64][64, 1]cpu" = torch.ops.aten.index.Tensor(select, [view_1]); select = view_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in _create_rand_mask_from_inputs, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
clone_1: "f32[396, 64][64, 1]cpu" = torch.ops.aten.clone.default(index); index = None
view_2: "f32[1, 396, 64][25344, 64, 1]cpu" = torch.ops.aten.view.default(clone_1, [1, 396, 64]); clone_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1016 in _create_rand_mask_from_inputs, code: rand_mask = rand_mask.view(batch_size, num_attention_heads, num_windows, num_rand_blocks * from_block_size)
view_3: "f32[1, 12, 11, 192][25344, 2112, 192, 1]cpu" = torch.ops.aten.view.default(view_2, [1, 12, 11, 192]); view_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1017 in _create_rand_mask_from_inputs, code: rand_mask = torch.einsum("blq,bhlk->bhlqk", from_blocked_mask[:, 1:-1], rand_mask)
slice_1: "f32[1, 13, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(arg13_1, 0, 0, 9223372036854775807); arg13_1 = None
slice_2: "f32[1, 11, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_1, 1, 1, -1); slice_1 = None
unsqueeze_1: "f32[1, 11, 64, 1][832, 64, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_2, 3); slice_2 = None
unsqueeze_2: "f32[1, 11, 64, 1, 1][832, 64, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_1, 4); unsqueeze_1 = None
permute: "f32[1, 1, 11, 64, 1][832, 1, 64, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_2, [0, 3, 1, 2, 4]); unsqueeze_2 = None
unsqueeze_3: "f32[1, 12, 11, 192, 1][25344, 2112, 192, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(view_3, 4); view_3 = None
permute_1: "f32[1, 12, 11, 1, 192][25344, 2112, 192, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_3, [0, 1, 2, 4, 3]); unsqueeze_3 = None
mul: "f32[1, 12, 11, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(permute, permute_1); permute = permute_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:602 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_query_matrix = query_layer.view(bsz, n_heads, from_seq_len // from_block_size, from_block_size, -1)
view_4: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.view.default(arg12_1, [1, 12, 13, 64, -1]); arg12_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:603 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix = key_layer.view(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1)
view_5: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.view.default(arg14_1, [1, 12, 13, 64, -1])
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:604 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix = value_layer.view(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1)
view_6: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.view.default(arg15_1, [1, 12, 13, 64, -1])
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:972 in torch_gather_b2, code: shift = torch.arange(indices.shape[0] * indices.shape[1] * num_indices_to_gather, device=indices.device)
iota: "i64[396][1]cpu" = torch.ops.prims.iota.default(396, start = 0, step = 1, dtype = torch.int64, device = device(type='cpu'), requires_grad = False)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:973 in torch_gather_b2, code: indices_shift = torch.div(shift, num_indices_to_gather, rounding_mode="floor") * num_indices_to_pick_from
div: "i64[396][1]cpu" = torch.ops.aten.div.Tensor_mode(iota, 33, rounding_mode = 'floor'); iota = None
mul_1: "i64[396][1]cpu" = torch.ops.aten.mul.Tensor(div, 13); div = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:975 in torch_gather_b2, code: flattened_indices = indices.view(-1) + indices_shift
view_7: "i64[396][1]cpu" = torch.ops.aten.view.default(clone, [-1])
add: "i64[396][1]cpu" = torch.ops.aten.add.Tensor(view_7, mul_1); view_7 = mul_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:976 in torch_gather_b2, code: flattened_params = params.reshape(-1, params.shape[-2], params.shape[-1])
clone_2: "bf16[1, 12, 13, 64, 64][638976, 53248, 4096, 64, 1]cpu" = torch.ops.aten.clone.default(view_5, memory_format = torch.contiguous_format)
view_8: "bf16[156, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.view.default(clone_2, [156, 64, 64]); clone_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:978 in torch_gather_b2, code: out_flattened = flattened_params.index_select(0, flattened_indices)
index_1: "bf16[396, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.index.Tensor(view_8, [add]); view_8 = add = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:980 in torch_gather_b2, code: out = out_flattened.reshape(params.shape[:2] + (num_indices_to_gather,) + params.shape[3:])
view_9: "bf16[1, 12, 33, 64, 64][1622016, 135168, 4096, 64, 1]cpu" = torch.ops.aten.view.default(index_1, [1, 12, 33, 64, 64]); index_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:608 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key = gathered_key.view(
view_10: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.view.default(view_9, [1, 12, 11, 192, -1]); view_9 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:972 in torch_gather_b2, code: shift = torch.arange(indices.shape[0] * indices.shape[1] * num_indices_to_gather, device=indices.device)
iota_1: "i64[396][1]cpu" = torch.ops.prims.iota.default(396, start = 0, step = 1, dtype = torch.int64, device = device(type='cpu'), requires_grad = False)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:973 in torch_gather_b2, code: indices_shift = torch.div(shift, num_indices_to_gather, rounding_mode="floor") * num_indices_to_pick_from
div_1: "i64[396][1]cpu" = torch.ops.aten.div.Tensor_mode(iota_1, 33, rounding_mode = 'floor'); iota_1 = None
mul_2: "i64[396][1]cpu" = torch.ops.aten.mul.Tensor(div_1, 13); div_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:975 in torch_gather_b2, code: flattened_indices = indices.view(-1) + indices_shift
view_11: "i64[396][1]cpu" = torch.ops.aten.view.default(clone, [-1])
add_1: "i64[396][1]cpu" = torch.ops.aten.add.Tensor(view_11, mul_2); view_11 = mul_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:976 in torch_gather_b2, code: flattened_params = params.reshape(-1, params.shape[-2], params.shape[-1])
clone_3: "bf16[1, 12, 13, 64, 64][638976, 53248, 4096, 64, 1]cpu" = torch.ops.aten.clone.default(view_6, memory_format = torch.contiguous_format)
view_12: "bf16[156, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.view.default(clone_3, [156, 64, 64]); clone_3 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:978 in torch_gather_b2, code: out_flattened = flattened_params.index_select(0, flattened_indices)
index_2: "bf16[396, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.index.Tensor(view_12, [add_1]); view_12 = add_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:980 in torch_gather_b2, code: out = out_flattened.reshape(params.shape[:2] + (num_indices_to_gather,) + params.shape[3:])
view_13: "bf16[1, 12, 33, 64, 64][1622016, 135168, 4096, 64, 1]cpu" = torch.ops.aten.view.default(index_2, [1, 12, 33, 64, 64]); index_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:612 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value = gathered_value.view(
view_14: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.view.default(view_13, [1, 12, 11, 192, -1]); view_13 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:621 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, 0], key_layer, ndim=4)
slice_3: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_4, 0, 0, 9223372036854775807)
slice_4: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_3, 1, 0, 9223372036854775807); slice_3 = None
select_2: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_4, 2, 0); slice_4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
view_15: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(select_2, [12, 64, 64]); select_2 = None
view_16: "bf16[12, 832, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(arg14_1, [12, 832, 64])
permute_2: "bf16[12, 64, 832][64, 1, 768]cpu" = torch.ops.aten.permute.default(view_16, [0, 2, 1]); view_16 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
bmm: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.ops.aten.bmm.default(view_15, permute_2); view_15 = permute_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
view_17: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.view.default(bmm, [1, 12, 64, 832]); bmm = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:623 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product = first_product * rsqrt_d
mul_3: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.mul.Tensor(view_17, 0.125); view_17 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:624 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product += (1.0 - to_mask) * attn_mask_penalty
sub: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, arg16_1)
mul_4: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.mul.Tensor(sub, -10000.0); sub = None
add_2: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.add.Tensor(mul_3, mul_4); mul_3 = mul_4 = None
convert_element_type_3: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.prims.convert_element_type.default(add_2, torch.bfloat16); add_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:625 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_attn_weights = nn.functional.softmax(
convert_element_type_4: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.prims.convert_element_type.default(convert_element_type_3, torch.float32); convert_element_type_3 = None
amax: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.amax.default(convert_element_type_4, [-1], True)
sub_1: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.sub.Tensor(convert_element_type_4, amax); convert_element_type_4 = amax = None
exp: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.exp.default(sub_1); sub_1 = None
sum_1: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp, [-1], True)
div_2: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.div.Tensor(exp, sum_1); exp = sum_1 = None
convert_element_type_5: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.prims.convert_element_type.default(div_2, torch.bfloat16); div_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
view_18: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.ops.aten.view.default(convert_element_type_5, [-1, 64, 832]); convert_element_type_5 = None
view_19: "bf16[12, 832, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(arg15_1, [12, 832, 64])
bmm_1: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_18, view_19); view_18 = view_19 = None
view_20: "bf16[1, 12, 64, 64][49152, 4096, 64, 1]cpu" = torch.ops.aten.view.default(bmm_1, [1, 12, 64, 64]); bmm_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:631 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_context_layer.unsqueeze_(2)
unsqueeze_4: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(view_20, 2); view_20 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:641 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 0],
slice_5: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807)
slice_6: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_5, 1, 0, 9223372036854775807); slice_5 = None
select_3: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_6, 2, 0); slice_6 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:642 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 1],
slice_7: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807)
slice_8: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_7, 1, 0, 9223372036854775807); slice_7 = None
select_4: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_8, 2, 1); slice_8 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:643 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 2],
slice_9: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807)
slice_10: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_9, 1, 0, 9223372036854775807); slice_9 = None
select_5: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_10, 2, 2); slice_10 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:644 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -1],
slice_11: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807)
slice_12: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_11, 1, 0, 9223372036854775807); slice_11 = None
select_6: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_12, 2, -1); slice_12 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:645 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key[:, :, 0],
slice_13: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view_10, 0, 0, 9223372036854775807)
slice_14: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_13, 1, 0, 9223372036854775807); slice_13 = None
select_7: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = torch.ops.aten.select.int(slice_14, 2, 0); slice_14 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:639 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_key_mat = torch.cat(
cat_1: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" = torch.ops.aten.cat.default([select_3, select_4, select_5, select_6, select_7], 2); select_3 = select_4 = select_5 = select_6 = select_7 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:651 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 0],
slice_15: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807)
slice_16: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_15, 1, 0, 9223372036854775807); slice_15 = None
select_8: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_16, 2, 0); slice_16 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:652 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 1],
slice_17: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807)
slice_18: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_17, 1, 0, 9223372036854775807); slice_17 = None
select_9: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_18, 2, 1); slice_18 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:653 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 2],
slice_19: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807)
slice_20: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_19, 1, 0, 9223372036854775807); slice_19 = None
select_10: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_20, 2, 2); slice_20 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:654 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -1],
slice_21: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807)
slice_22: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_21, 1, 0, 9223372036854775807); slice_21 = None
select_11: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_22, 2, -1); slice_22 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:655 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value[:, :, 0],
slice_23: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view_14, 0, 0, 9223372036854775807)
slice_24: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_23, 1, 0, 9223372036854775807); slice_23 = None
select_12: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = torch.ops.aten.select.int(slice_24, 2, 0); slice_24 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:649 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_value_mat = torch.cat(
cat_2: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" = torch.ops.aten.cat.default([select_8, select_9, select_10, select_11, select_12], 2); select_8 = select_9 = select_10 = select_11 = select_12 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:661 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, 1], second_key_mat, ndim=4)
slice_25: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_4, 0, 0, 9223372036854775807)
slice_26: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_25, 1, 0, 9223372036854775807); slice_25 = None
select_13: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_26, 2, 1); slice_26 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
view_21: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(select_13, [12, 64, 64]); select_13 = None
view_22: "bf16[12, 448, 64][28672, 64, 1]cpu" = torch.ops.aten.view.default(cat_1, [-1, 448, 64]); cat_1 = None
permute_3: "bf16[12, 64, 448][28672, 1, 64]cpu" = torch.ops.aten.permute.default(view_22, [0, 2, 1]); view_22 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
bmm_2: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.ops.aten.bmm.default(view_21, permute_3); view_21 = permute_3 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
view_23: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.view.default(bmm_2, [1, 12, 64, 448]); bmm_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:664 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, : 3 * to_block_size],
slice_27: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 0, 0, 9223372036854775807)
slice_28: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_27, 1, 0, 9223372036854775807); slice_27 = None
slice_29: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_28, 2, 0, 9223372036854775807); slice_28 = None
slice_30: "f32[1, 1, 1, 192][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_29, 3, 0, 192); slice_29 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:665 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, -to_block_size:],
slice_31: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 0, 0, 9223372036854775807)
slice_32: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_31, 1, 0, 9223372036854775807); slice_31 = None
slice_33: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_32, 2, 0, 9223372036854775807); slice_32 = None
slice_34: "f32[1, 1, 1, 64][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_33, 3, -64, 9223372036854775807); slice_33 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:666 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask.new_ones([bsz, 1, 1, n_rand_blocks * to_block_size]),
full: "f32[1, 1, 1, 192][192, 192, 192, 1]cpu" = torch.ops.aten.full.default([1, 1, 1, 192], 1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:662 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_seq_pad = torch.cat(
cat_3: "f32[1, 1, 1, 448][448, 448, 448, 1]cpu" = torch.ops.aten.cat.default([slice_30, slice_34, full], 3); slice_30 = slice_34 = full = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:672 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask.new_ones([bsz, n_heads, from_block_size, 4 * to_block_size]),
full_1: "f32[1, 12, 64, 256][196608, 16384, 256, 1]cpu" = torch.ops.aten.full.default([1, 12, 64, 256], 1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:673 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask[:, :, 0],
slice_35: "f32[1, 12, 11, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.ops.aten.slice.Tensor(mul, 0, 0, 9223372036854775807)
slice_36: "f32[1, 12, 11, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.ops.aten.slice.Tensor(slice_35, 1, 0, 9223372036854775807); slice_35 = None
select_14: "f32[1, 12, 64, 192][1622016, 135168, 192, 1]cpu" = torch.ops.aten.select.int(slice_36, 2, 0); slice_36 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:670 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_rand_pad = torch.cat(
cat_4: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.cat.default([full_1, select_14], 3); full_1 = select_14 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:677 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product = second_product * rsqrt_d
mul_5: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.mul.Tensor(view_23, 0.125); view_23 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:678 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product += (1.0 - torch.minimum(second_seq_pad, second_rand_pad)) * attn_mask_penalty
minimum: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.minimum.default(cat_3, cat_4); cat_3 = cat_4 = None
sub_2: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, minimum); minimum = None
mul_6: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.mul.Tensor(sub_2, -10000.0); sub_2 = None
add_3: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.add.Tensor(mul_5, mul_6); mul_5 = mul_6 = None
convert_element_type_10: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.prims.convert_element_type.default(add_3, torch.bfloat16); add_3 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:679 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_attn_weights = nn.functional.softmax(
convert_element_type_11: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.prims.convert_element_type.default(convert_element_type_10, torch.float32); convert_element_type_10 = None
amax_1: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.amax.default(convert_element_type_11, [-1], True)
sub_3: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.sub.Tensor(convert_element_type_11, amax_1); convert_element_type_11 = amax_1 = None
exp_1: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.exp.default(sub_3); sub_3 = None
sum_2: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp_1, [-1], True)
div_3: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.div.Tensor(exp_1, sum_2); exp_1 = sum_2 = None
convert_element_type_12: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.prims.convert_element_type.default(div_3, torch.bfloat16); div_3 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
view_24: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.ops.aten.view.default(convert_element_type_12, [-1, 64, 448]); convert_element_type_12 = None
view_25: "bf16[12, 448, 64][28672, 64, 1]cpu" = torch.ops.aten.view.default(cat_2, [-1, 448, 64]); cat_2 = None
bmm_3: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_24, view_25); view_24 = view_25 = None
view_26: "bf16[1, 12, 64, 64][49152, 4096, 64, 1]cpu" = torch.ops.aten.view.default(bmm_3, [1, 12, 64, 64]); bmm_3 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:686 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_context_layer.unsqueeze_(2)
unsqueeze_5: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(view_26, 2); view_26 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:696 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: [blocked_key_matrix[:, :, 1:-3], blocked_key_matrix[:, :, 2:-2], blocked_key_matrix[:, :, 3:-1]], dim=3
slice_37: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807)
slice_38: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_37, 1, 0, 9223372036854775807); slice_37 = None
slice_39: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_38, 2, 1, -3); slice_38 = None
slice_40: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807)
slice_41: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_40, 1, 0, 9223372036854775807); slice_40 = None
slice_42: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_41, 2, 2, -2); slice_41 = None
slice_43: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807)
slice_44: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_43, 1, 0, 9223372036854775807); slice_43 = None
slice_45: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_44, 2, 3, -1); slice_44 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:695 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: exp_blocked_key_matrix = torch.cat(
cat_5: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" = torch.ops.aten.cat.default([slice_39, slice_42, slice_45], 3); slice_39 = slice_42 = slice_45 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:699 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: [blocked_value_matrix[:, :, 1:-3], blocked_value_matrix[:, :, 2:-2], blocked_value_matrix[:, :, 3:-1]],
slice_46: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807)
slice_47: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_46, 1, 0, 9223372036854775807); slice_46 = None
slice_48: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_47, 2, 1, -3); slice_47 = None
slice_49: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807)
slice_50: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_49, 1, 0, 9223372036854775807); slice_49 = None
slice_51: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_50, 2, 2, -2); slice_50 = None
slice_52: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807)
slice_53: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_52, 1, 0, 9223372036854775807); slice_52 = None
slice_54: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_53, 2, 3, -1); slice_53 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:698 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: exp_blocked_value_matrix = torch.cat(
cat_6: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" = torch.ops.aten.cat.default([slice_48, slice_51, slice_54], 3); slice_48 = slice_51 = slice_54 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:702 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: middle_query_matrix = blocked_query_matrix[:, :, 2:-2]
slice_55: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_4, 0, 0, 9223372036854775807)
slice_56: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_55, 1, 0, 9223372036854775807); slice_55 = None
slice_57: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_56, 2, 2, -2); slice_56 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
clone_4: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.clone.default(slice_57, memory_format = torch.contiguous_format)
view_27: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.view.default(clone_4, [108, 64, 64]); clone_4 = None
view_28: "bf16[108, 192, 64][12288, 64, 1]cpu" = torch.ops.aten.view.default(cat_5, [-1, 192, 64]); cat_5 = None
permute_4: "bf16[108, 64, 192][12288, 1, 64]cpu" = torch.ops.aten.permute.default(view_28, [0, 2, 1]); view_28 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
bmm_4: "bf16[108, 64, 192][12288, 192, 1]cpu" = torch.ops.aten.bmm.default(view_27, permute_4); view_27 = permute_4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
view_29: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.view.default(bmm_4, [1, 12, 9, 64, 192]); bmm_4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:708 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: inner_band_product = inner_band_product * rsqrt_d
mul_7: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(view_29, 0.125); view_29 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:712 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product = self.torch_bmm_nd_transpose(middle_query_matrix, gathered_key[:, :, 1:-1], ndim=5)
slice_58: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view_10, 0, 0, 9223372036854775807)
slice_59: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_58, 1, 0, 9223372036854775807); slice_58 = None
slice_60: "bf16[1, 12, 9, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_59, 2, 1, -1); slice_59 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
clone_5: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.clone.default(slice_57, memory_format = torch.contiguous_format)
view_30: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.view.default(clone_5, [108, 64, 64]); clone_5 = None
clone_6: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" = torch.ops.aten.clone.default(slice_60, memory_format = torch.contiguous_format); slice_60 = None
view_31: "bf16[108, 192, 64][12288, 64, 1]cpu" = torch.ops.aten.view.default(clone_6, [108, 192, 64]); clone_6 = None
permute_5: "bf16[108, 64, 192][12288, 1, 64]cpu" = torch.ops.aten.permute.default(view_31, [0, 2, 1]); view_31 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
bmm_5: "bf16[108, 64, 192][12288, 192, 1]cpu" = torch.ops.aten.bmm.default(view_30, permute_5); view_30 = permute_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
view_32: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.view.default(bmm_5, [1, 12, 9, 64, 192]); bmm_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:714 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product = rand_band_product * rsqrt_d
mul_8: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(view_32, 0.125); view_32 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:718 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqd,bhkd->bhlqk", middle_query_matrix, blocked_key_matrix[:, :, 0]
slice_61: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807)
slice_62: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_61, 1, 0, 9223372036854775807); slice_61 = None
select_15: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_62, 2, 0); slice_62 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:717 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product = torch.einsum(
unsqueeze_6: "bf16[1, 12, 9, 64, 64, 1][638976, 64, 49152, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_57, 5)
permute_6: "bf16[1, 12, 9, 64, 1, 64][638976, 64, 49152, 768, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_6, [0, 1, 2, 3, 5, 4]); unsqueeze_6 = None
unsqueeze_7: "bf16[1, 12, 64, 64, 1][638976, 64, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(select_15, 4); select_15 = None
unsqueeze_8: "bf16[1, 12, 64, 64, 1, 1][638976, 64, 768, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_7, 5); unsqueeze_7 = None
permute_7: "bf16[1, 12, 1, 1, 64, 64][638976, 64, 1, 1, 768, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_8, [0, 1, 4, 5, 2, 3]); unsqueeze_8 = None
permute_8: "bf16[12, 9, 64, 64, 1, 1][64, 49152, 768, 1, 638976, 1]cpu" = torch.ops.aten.permute.default(permute_6, [1, 2, 3, 5, 0, 4]); permute_6 = None
view_33: "bf16[12, 576, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(permute_8, [12, 576, 64]); permute_8 = None
permute_9: "bf16[12, 64, 1, 64, 1, 1][64, 1, 638976, 768, 1, 1]cpu" = torch.ops.aten.permute.default(permute_7, [1, 5, 0, 4, 2, 3]); permute_7 = None
view_34: "bf16[12, 64, 64][64, 1, 768]cpu" = torch.ops.aten.view.default(permute_9, [12, 64, 64]); permute_9 = None
bmm_6: "bf16[12, 576, 64][36864, 64, 1]cpu" = torch.ops.aten.bmm.default(view_33, view_34); view_33 = view_34 = None
view_35: "bf16[12, 9, 64, 1, 1, 64][36864, 4096, 64, 64, 64, 1]cpu" = torch.ops.aten.view.default(bmm_6, [12, 9, 64, 1, 1, 64]); bmm_6 = None
permute_10: "bf16[1, 12, 9, 64, 64, 1][64, 36864, 4096, 64, 1, 64]cpu" = torch.ops.aten.permute.default(view_35, [4, 0, 1, 2, 5, 3]); view_35 = None
view_36: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.ops.aten.view.default(permute_10, [1, 12, 9, 64, 64]); permute_10 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:720 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product = first_band_product * rsqrt_d
mul_9: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.mul.Tensor(view_36, 0.125); view_36 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:724 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqd,bhkd->bhlqk", middle_query_matrix, blocked_key_matrix[:, :, -1]
slice_63: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807)
slice_64: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_63, 1, 0, 9223372036854775807); slice_63 = None
select_16: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_64, 2, -1); slice_64 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:723 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product = torch.einsum(
unsqueeze_9: "bf16[1, 12, 9, 64, 64, 1][638976, 64, 49152, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_57, 5); slice_57 = None
permute_11: "bf16[1, 12, 9, 64, 1, 64][638976, 64, 49152, 768, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_9, [0, 1, 2, 3, 5, 4]); unsqueeze_9 = None
unsqueeze_10: "bf16[1, 12, 64, 64, 1][638976, 64, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(select_16, 4); select_16 = None
unsqueeze_11: "bf16[1, 12, 64, 64, 1, 1][638976, 64, 768, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_10, 5); unsqueeze_10 = None
permute_12: "bf16[1, 12, 1, 1, 64, 64][638976, 64, 1, 1, 768, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_11, [0, 1, 4, 5, 2, 3]); unsqueeze_11 = None
permute_13: "bf16[12, 9, 64, 64, 1, 1][64, 49152, 768, 1, 638976, 1]cpu" = torch.ops.aten.permute.default(permute_11, [1, 2, 3, 5, 0, 4]); permute_11 = None
view_37: "bf16[12, 576, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(permute_13, [12, 576, 64]); permute_13 = None
permute_14: "bf16[12, 64, 1, 64, 1, 1][64, 1, 638976, 768, 1, 1]cpu" = torch.ops.aten.permute.default(permute_12, [1, 5, 0, 4, 2, 3]); permute_12 = None
view_38: "bf16[12, 64, 64][64, 1, 768]cpu" = torch.ops.aten.view.default(permute_14, [12, 64, 64]); permute_14 = None
bmm_7: "bf16[12, 576, 64][36864, 64, 1]cpu" = torch.ops.aten.bmm.default(view_37, view_38); view_37 = view_38 = None
view_39: "bf16[12, 9, 64, 1, 1, 64][36864, 4096, 64, 64, 64, 1]cpu" = torch.ops.aten.view.default(bmm_7, [12, 9, 64, 1, 1, 64]); bmm_7 = None
permute_15: "bf16[1, 12, 9, 64, 64, 1][64, 36864, 4096, 64, 1, 64]cpu" = torch.ops.aten.permute.default(view_39, [4, 0, 1, 2, 5, 3]); view_39 = None
view_40: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.ops.aten.view.default(permute_15, [1, 12, 9, 64, 64]); permute_15 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:726 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product = last_band_product * rsqrt_d
mul_10: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.mul.Tensor(view_40, 0.125); view_40 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:729 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: inner_band_product += (1.0 - band_mask) * attn_mask_penalty
sub_4: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, arg17_1); arg17_1 = None
mul_11: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(sub_4, -10000.0); sub_4 = None
add_4: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.add.Tensor(mul_7, mul_11); mul_7 = mul_11 = None
convert_element_type_23: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.prims.convert_element_type.default(add_4, torch.bfloat16); add_4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:730 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product += (1.0 - to_mask[:, :, :, :to_block_size].unsqueeze(3)) * attn_mask_penalty
slice_65: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 0, 0, 9223372036854775807)
slice_66: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_65, 1, 0, 9223372036854775807); slice_65 = None
slice_67: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_66, 2, 0, 9223372036854775807); slice_66 = None
slice_68: "f32[1, 1, 1, 64][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_67, 3, 0, 64); slice_67 = None
unsqueeze_12: "f32[1, 1, 1, 1, 64][832, 832, 832, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_68, 3); slice_68 = None
sub_5: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, unsqueeze_12); unsqueeze_12 = None
mul_12: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = torch.ops.aten.mul.Tensor(sub_5, -10000.0); sub_5 = None
add_5: "f32[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(mul_9, mul_12); mul_9 = mul_12 = None
convert_element_type_24: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.prims.convert_element_type.default(add_5, torch.bfloat16); add_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:731 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product += (1.0 - to_mask[:, :, :, -to_block_size:].unsqueeze(3)) * attn_mask_penalty
slice_69: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 0, 0, 9223372036854775807)
slice_70: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_69, 1, 0, 9223372036854775807); slice_69 = None
slice_71: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_70, 2, 0, 9223372036854775807); slice_70 = None
slice_72: "f32[1, 1, 1, 64][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_71, 3, -64, 9223372036854775807); slice_71 = None
unsqueeze_13: "f32[1, 1, 1, 1, 64][832, 832, 832, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_72, 3); slice_72 = None
sub_6: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, unsqueeze_13); unsqueeze_13 = None
mul_13: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = torch.ops.aten.mul.Tensor(sub_6, -10000.0); sub_6 = None
add_6: "f32[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(mul_10, mul_13); mul_10 = mul_13 = None
convert_element_type_25: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.prims.convert_element_type.default(add_6, torch.bfloat16); add_6 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:732 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product += (1.0 - rand_mask[:, :, 1:-1]) * attn_mask_penalty
slice_73: "f32[1, 12, 11, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.ops.aten.slice.Tensor(mul, 0, 0, 9223372036854775807)
slice_74: "f32[1, 12, 11, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.ops.aten.slice.Tensor(slice_73, 1, 0, 9223372036854775807); slice_73 = None
slice_75: "f32[1, 12, 9, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.ops.aten.slice.Tensor(slice_74, 2, 1, -1); slice_74 = None
sub_7: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, slice_75); slice_75 = None
mul_14: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(sub_7, -10000.0); sub_7 = None
add_7: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.add.Tensor(mul_8, mul_14); mul_8 = mul_14 = None
convert_element_type_26: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.prims.convert_element_type.default(add_7, torch.bfloat16); add_7 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:735 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: band_product = torch.cat(
cat_7: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.cat.default([convert_element_type_24, convert_element_type_23, convert_element_type_26, convert_element_type_25], -1); convert_element_type_24 = convert_element_type_23 = convert_element_type_26 = convert_element_type_25 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:740 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights = nn.functional.softmax(
convert_element_type_27: "f32[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.prims.convert_element_type.default(cat_7, torch.float32); cat_7 = None
amax_2: "f32[1, 12, 9, 64, 1][6912, 576, 64, 1, 1]cpu" = torch.ops.aten.amax.default(convert_element_type_27, [-1], True)
sub_8: "f32[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.sub.Tensor(convert_element_type_27, amax_2); convert_element_type_27 = amax_2 = None
exp_2: "f32[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.exp.default(sub_8); sub_8 = None
sum_3: "f32[1, 12, 9, 64, 1][6912, 576, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp_2, [-1], True)
div_4: "f32[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.div.Tensor(exp_2, sum_3); exp_2 = sum_3 = None
convert_element_type_28: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.prims.convert_element_type.default(div_4, torch.bfloat16); div_4 = None
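        # Sketch: amax_2/sub_8/exp_2/sum_3/div_4 are the numerically stable softmax
        # decomposition (subtract the row max, exponentiate, normalize), run in fp32
        # and cast back to bf16. Dense equivalent, with the shape assumed from this trace:
        import torch
        x = torch.randn(1, 12, 9, 64, 512)
        m = x.amax(dim=-1, keepdim=True)        # row max keeps exp() from overflowing
        p = (x - m).exp()
        soft = p / p.sum(dim=-1, keepdim=True)
        assert torch.allclose(soft, torch.softmax(x, dim=-1), atol=1e-6)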
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:747 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights[:, :, :, :, to_block_size : 4 * to_block_size], exp_blocked_value_matrix, ndim=5
slice_76: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(convert_element_type_28, 0, 0, 9223372036854775807)
slice_77: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_76, 1, 0, 9223372036854775807); slice_76 = None
slice_78: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_77, 2, 0, 9223372036854775807); slice_77 = None
slice_79: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_78, 3, 0, 9223372036854775807); slice_78 = None
slice_80: "bf16[1, 12, 9, 64, 192][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_79, 4, 64, 256); slice_79 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
view_41: "bf16[108, 64, 192][32768, 512, 1]cpu" = torch.ops.aten.view.default(slice_80, [108, 64, 192]); slice_80 = None
view_42: "bf16[108, 192, 64][12288, 64, 1]cpu" = torch.ops.aten.view.default(cat_6, [-1, 192, 64]); cat_6 = None
bmm_8: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_41, view_42); view_41 = view_42 = None
view_43: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.view.default(bmm_8, [1, 12, 9, 64, 64]); bmm_8 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:754 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights[:, :, :, :, 4 * to_block_size : -to_block_size], gathered_value[:, :, 1:-1], ndim=5
slice_81: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(convert_element_type_28, 0, 0, 9223372036854775807)
slice_82: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_81, 1, 0, 9223372036854775807); slice_81 = None
slice_83: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_82, 2, 0, 9223372036854775807); slice_82 = None
slice_84: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_83, 3, 0, 9223372036854775807); slice_83 = None
slice_85: "bf16[1, 12, 9, 64, 192][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_84, 4, 256, -64); slice_84 = None
slice_86: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view_14, 0, 0, 9223372036854775807)
slice_87: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_86, 1, 0, 9223372036854775807); slice_86 = None
slice_88: "bf16[1, 12, 9, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_87, 2, 1, -1); slice_87 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
view_44: "bf16[108, 64, 192][32768, 512, 1]cpu" = torch.ops.aten.view.default(slice_85, [108, 64, 192]); slice_85 = None
clone_7: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" = torch.ops.aten.clone.default(slice_88, memory_format = torch.contiguous_format); slice_88 = None
view_45: "bf16[108, 192, 64][12288, 64, 1]cpu" = torch.ops.aten.view.default(clone_7, [108, 192, 64]); clone_7 = None
bmm_9: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_44, view_45); view_44 = view_45 = None
view_46: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.view.default(bmm_9, [1, 12, 9, 64, 64]); bmm_9 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:753 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += self.torch_bmm_nd(
add_8: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(view_43, view_46); view_43 = view_46 = None
view_47: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.view.default(add_8, [108, 64, 64]); add_8 = None
view_48: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.view.default(view_47, [1, 12, 9, 64, 64]); view_47 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:760 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, :to_block_size], blocked_value_matrix[:, :, 0]
slice_89: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(convert_element_type_28, 0, 0, 9223372036854775807)
slice_90: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_89, 1, 0, 9223372036854775807); slice_89 = None
slice_91: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_90, 2, 0, 9223372036854775807); slice_90 = None
slice_92: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_91, 3, 0, 9223372036854775807); slice_91 = None
slice_93: "bf16[1, 12, 9, 64, 64][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_92, 4, 0, 64); slice_92 = None
slice_94: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807)
slice_95: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_94, 1, 0, 9223372036854775807); slice_94 = None
select_17: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_95, 2, 0); slice_95 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:759 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += torch.einsum(
unsqueeze_14: "bf16[1, 12, 9, 64, 64, 1][3538944, 294912, 32768, 512, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_93, 5); slice_93 = None
permute_16: "bf16[1, 12, 9, 64, 1, 64][3538944, 294912, 32768, 512, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_14, [0, 1, 2, 3, 5, 4]); unsqueeze_14 = None
unsqueeze_15: "bf16[1, 12, 64, 64, 1][638976, 64, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(select_17, 4); select_17 = None
unsqueeze_16: "bf16[1, 12, 64, 64, 1, 1][638976, 64, 768, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_15, 5); unsqueeze_15 = None
permute_17: "bf16[1, 12, 1, 1, 64, 64][638976, 64, 1, 1, 1, 768]cpu" = torch.ops.aten.permute.default(unsqueeze_16, [0, 1, 4, 5, 3, 2]); unsqueeze_16 = None
permute_18: "bf16[12, 9, 64, 64, 1, 1][294912, 32768, 512, 1, 3538944, 1]cpu" = torch.ops.aten.permute.default(permute_16, [1, 2, 3, 5, 0, 4]); permute_16 = None
view_49: "bf16[12, 576, 64][294912, 512, 1]cpu" = torch.ops.aten.view.default(permute_18, [12, 576, 64]); permute_18 = None
permute_19: "bf16[12, 64, 1, 64, 1, 1][64, 768, 638976, 1, 1, 1]cpu" = torch.ops.aten.permute.default(permute_17, [1, 5, 0, 4, 2, 3]); permute_17 = None
view_50: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(permute_19, [12, 64, 64]); permute_19 = None
bmm_10: "bf16[12, 576, 64][36864, 64, 1]cpu" = torch.ops.aten.bmm.default(view_49, view_50); view_49 = view_50 = None
view_51: "bf16[12, 9, 64, 1, 1, 64][36864, 4096, 64, 64, 64, 1]cpu" = torch.ops.aten.view.default(bmm_10, [12, 9, 64, 1, 1, 64]); bmm_10 = None
permute_20: "bf16[1, 12, 9, 64, 64, 1][64, 36864, 4096, 64, 1, 64]cpu" = torch.ops.aten.permute.default(view_51, [4, 0, 1, 2, 5, 3]); view_51 = None
view_52: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.ops.aten.view.default(permute_20, [1, 12, 9, 64, 64]); permute_20 = None
add_9: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(view_48, view_52); view_48 = view_52 = None
view_53: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.view.default(add_9, [108, 64, 64]); add_9 = None
view_54: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.view.default(view_53, [1, 12, 9, 64, 64]); view_53 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:763 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, -to_block_size:], blocked_value_matrix[:, :, -1]
slice_96: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(convert_element_type_28, 0, 0, 9223372036854775807); convert_element_type_28 = None
slice_97: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_96, 1, 0, 9223372036854775807); slice_96 = None
slice_98: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_97, 2, 0, 9223372036854775807); slice_97 = None
slice_99: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_98, 3, 0, 9223372036854775807); slice_98 = None
slice_100: "bf16[1, 12, 9, 64, 64][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_99, 4, -64, 9223372036854775807); slice_99 = None
slice_101: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807)
slice_102: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_101, 1, 0, 9223372036854775807); slice_101 = None
select_18: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_102, 2, -1); slice_102 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:762 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += torch.einsum(
unsqueeze_17: "bf16[1, 12, 9, 64, 64, 1][3538944, 294912, 32768, 512, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_100, 5); slice_100 = None
permute_21: "bf16[1, 12, 9, 64, 1, 64][3538944, 294912, 32768, 512, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_17, [0, 1, 2, 3, 5, 4]); unsqueeze_17 = None
unsqueeze_18: "bf16[1, 12, 64, 64, 1][638976, 64, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(select_18, 4); select_18 = None
unsqueeze_19: "bf16[1, 12, 64, 64, 1, 1][638976, 64, 768, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_18, 5); unsqueeze_18 = None
permute_22: "bf16[1, 12, 1, 1, 64, 64][638976, 64, 1, 1, 1, 768]cpu" = torch.ops.aten.permute.default(unsqueeze_19, [0, 1, 4, 5, 3, 2]); unsqueeze_19 = None
permute_23: "bf16[12, 9, 64, 64, 1, 1][294912, 32768, 512, 1, 3538944, 1]cpu" = torch.ops.aten.permute.default(permute_21, [1, 2, 3, 5, 0, 4]); permute_21 = None
view_55: "bf16[12, 576, 64][294912, 512, 1]cpu" = torch.ops.aten.view.default(permute_23, [12, 576, 64]); permute_23 = None
permute_24: "bf16[12, 64, 1, 64, 1, 1][64, 768, 638976, 1, 1, 1]cpu" = torch.ops.aten.permute.default(permute_22, [1, 5, 0, 4, 2, 3]); permute_22 = None
view_56: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(permute_24, [12, 64, 64]); permute_24 = None
bmm_11: "bf16[12, 576, 64][36864, 64, 1]cpu" = torch.ops.aten.bmm.default(view_55, view_56); view_55 = view_56 = None
view_57: "bf16[12, 9, 64, 1, 1, 64][36864, 4096, 64, 64, 64, 1]cpu" = torch.ops.aten.view.default(bmm_11, [12, 9, 64, 1, 1, 64]); bmm_11 = None
permute_25: "bf16[1, 12, 9, 64, 64, 1][64, 36864, 4096, 64, 1, 64]cpu" = torch.ops.aten.permute.default(view_57, [4, 0, 1, 2, 5, 3]); view_57 = None
view_58: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.ops.aten.view.default(permute_25, [1, 12, 9, 64, 64]); permute_25 = None
add_10: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(view_54, view_58); view_54 = view_58 = None
view_59: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.view.default(add_10, [108, 64, 64]); add_10 = None
view_60: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.view.default(view_59, [1, 12, 9, 64, 64]); view_59 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:775 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 0],
slice_103: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807)
slice_104: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_103, 1, 0, 9223372036854775807); slice_103 = None
select_19: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_104, 2, 0); slice_104 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:776 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -3],
slice_105: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807)
slice_106: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_105, 1, 0, 9223372036854775807); slice_105 = None
select_20: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_106, 2, -3); slice_106 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:777 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -2],
slice_107: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807)
slice_108: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_107, 1, 0, 9223372036854775807); slice_107 = None
select_21: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_108, 2, -2); slice_108 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:778 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -1],
slice_109: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807); view_5 = None
slice_110: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_109, 1, 0, 9223372036854775807); slice_109 = None
select_22: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_110, 2, -1); slice_110 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:779 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key[:, :, -1],
slice_111: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view_10, 0, 0, 9223372036854775807); view_10 = None
slice_112: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_111, 1, 0, 9223372036854775807); slice_111 = None
select_23: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = torch.ops.aten.select.int(slice_112, 2, -1); slice_112 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:773 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_key_mat = torch.cat(
cat_8: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" = torch.ops.aten.cat.default([select_19, select_20, select_21, select_22, select_23], 2); select_19 = select_20 = select_21 = select_22 = select_23 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:785 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 0],
slice_113: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807)
slice_114: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_113, 1, 0, 9223372036854775807); slice_113 = None
select_24: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_114, 2, 0); slice_114 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:786 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -3],
slice_115: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807)
slice_116: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_115, 1, 0, 9223372036854775807); slice_115 = None
select_25: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_116, 2, -3); slice_116 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:787 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -2],
slice_117: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807)
slice_118: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_117, 1, 0, 9223372036854775807); slice_117 = None
select_26: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_118, 2, -2); slice_118 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:788 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -1],
slice_119: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807); view_6 = None
slice_120: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_119, 1, 0, 9223372036854775807); slice_119 = None
select_27: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_120, 2, -1); slice_120 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:789 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value[:, :, -1],
slice_121: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view_14, 0, 0, 9223372036854775807); view_14 = None
slice_122: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_121, 1, 0, 9223372036854775807); slice_121 = None
select_28: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = torch.ops.aten.select.int(slice_122, 2, -1); slice_122 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:783 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_value_mat = torch.cat(
cat_9: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" = torch.ops.aten.cat.default([select_24, select_25, select_26, select_27, select_28], 2); select_24 = select_25 = select_26 = select_27 = select_28 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:795 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -2], second_last_key_mat, ndim=4)
slice_123: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_4, 0, 0, 9223372036854775807)
slice_124: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_123, 1, 0, 9223372036854775807); slice_123 = None
select_29: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_124, 2, -2); slice_124 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
view_61: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(select_29, [12, 64, 64]); select_29 = None
view_62: "bf16[12, 448, 64][28672, 64, 1]cpu" = torch.ops.aten.view.default(cat_8, [-1, 448, 64]); cat_8 = None
permute_26: "bf16[12, 64, 448][28672, 1, 64]cpu" = torch.ops.aten.permute.default(view_62, [0, 2, 1]); view_62 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
bmm_12: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.ops.aten.bmm.default(view_61, permute_26); view_61 = permute_26 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
view_63: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.view.default(bmm_12, [1, 12, 64, 448]); bmm_12 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:798 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, :to_block_size],
slice_125: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 0, 0, 9223372036854775807)
slice_126: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_125, 1, 0, 9223372036854775807); slice_125 = None
slice_127: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_126, 2, 0, 9223372036854775807); slice_126 = None
slice_128: "f32[1, 1, 1, 64][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_127, 3, 0, 64); slice_127 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:799 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, -3 * to_block_size :],
slice_129: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 0, 0, 9223372036854775807)
slice_130: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_129, 1, 0, 9223372036854775807); slice_129 = None
slice_131: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_130, 2, 0, 9223372036854775807); slice_130 = None
slice_132: "f32[1, 1, 1, 192][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_131, 3, -192, 9223372036854775807); slice_131 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:800 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask.new_ones([bsz, 1, 1, n_rand_blocks * to_block_size]),
full_2: "f32[1, 1, 1, 192][192, 192, 192, 1]cpu" = torch.ops.aten.full.default([1, 1, 1, 192], 1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:796 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_seq_pad = torch.cat(
cat_10: "f32[1, 1, 1, 448][448, 448, 448, 1]cpu" = torch.ops.aten.cat.default([slice_128, slice_132, full_2], 3); slice_128 = slice_132 = full_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:806 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask.new_ones([bsz, n_heads, from_block_size, 4 * to_block_size]),
full_3: "f32[1, 12, 64, 256][196608, 16384, 256, 1]cpu" = torch.ops.aten.full.default([1, 12, 64, 256], 1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:807 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask[:, :, -1],
slice_133: "f32[1, 12, 11, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.ops.aten.slice.Tensor(mul, 0, 0, 9223372036854775807); mul = None
slice_134: "f32[1, 12, 11, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.ops.aten.slice.Tensor(slice_133, 1, 0, 9223372036854775807); slice_133 = None
select_30: "f32[1, 12, 64, 192][1622016, 135168, 192, 1]cpu" = torch.ops.aten.select.int(slice_134, 2, -1); slice_134 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:804 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_rand_pad = torch.cat(
cat_11: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.cat.default([full_3, select_30], 3); full_3 = select_30 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:811 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product = second_last_product * rsqrt_d
mul_15: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.mul.Tensor(view_63, 0.125); view_63 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:812 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product += (1.0 - torch.minimum(second_last_seq_pad, second_last_rand_pad)) * attn_mask_penalty
minimum_1: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.minimum.default(cat_10, cat_11); cat_10 = cat_11 = None
sub_9: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, minimum_1); minimum_1 = None
mul_16: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.mul.Tensor(sub_9, -10000.0); sub_9 = None
add_11: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.add.Tensor(mul_15, mul_16); mul_15 = mul_16 = None
convert_element_type_39: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.prims.convert_element_type.default(add_11, torch.bfloat16); add_11 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:813 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_attn_weights = nn.functional.softmax(
convert_element_type_40: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.prims.convert_element_type.default(convert_element_type_39, torch.float32); convert_element_type_39 = None
amax_3: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.amax.default(convert_element_type_40, [-1], True)
sub_10: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.sub.Tensor(convert_element_type_40, amax_3); convert_element_type_40 = amax_3 = None
exp_3: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.exp.default(sub_10); sub_10 = None
sum_4: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp_3, [-1], True)
div_5: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.div.Tensor(exp_3, sum_4); exp_3 = sum_4 = None
convert_element_type_41: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.prims.convert_element_type.default(div_5, torch.bfloat16); div_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
view_64: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.ops.aten.view.default(convert_element_type_41, [-1, 64, 448]); convert_element_type_41 = None
view_65: "bf16[12, 448, 64][28672, 64, 1]cpu" = torch.ops.aten.view.default(cat_9, [-1, 448, 64]); cat_9 = None
bmm_13: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_64, view_65); view_64 = view_65 = None
view_66: "bf16[1, 12, 64, 64][49152, 4096, 64, 1]cpu" = torch.ops.aten.view.default(bmm_13, [1, 12, 64, 64]); bmm_13 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:819 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_context_layer.unsqueeze_(2)
unsqueeze_20: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(view_66, 2); view_66 = None
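        # Note: the model source calls the in-place unsqueeze_(2), but the trace
        # records the pure aten.unsqueeze.default because functionalization rewrites
        # mutations into out-of-place ops with identical values. A quick equivalence check:
        import torch
        t = torch.randn(1, 12, 64, 64)
        assert torch.equal(t.unsqueeze(2), t.clone().unsqueeze_(2))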
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:826 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -1], key_layer, ndim=4)
slice_135: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_4, 0, 0, 9223372036854775807); view_4 = None
slice_136: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_135, 1, 0, 9223372036854775807); slice_135 = None
select_31: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_136, 2, -1); slice_136 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
view_67: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(select_31, [12, 64, 64]); select_31 = None
view_68: "bf16[12, 832, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(arg14_1, [12, 832, 64]); arg14_1 = None
permute_27: "bf16[12, 64, 832][64, 1, 768]cpu" = torch.ops.aten.permute.default(view_68, [0, 2, 1]); view_68 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
bmm_14: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.ops.aten.bmm.default(view_67, permute_27); view_67 = permute_27 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
view_69: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.view.default(bmm_14, [1, 12, 64, 832]); bmm_14 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:827 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product = last_product * rsqrt_d
mul_17: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.mul.Tensor(view_69, 0.125); view_69 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:828 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product += (1.0 - to_mask) * attn_mask_penalty
sub_11: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, arg16_1); arg16_1 = None
mul_18: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.mul.Tensor(sub_11, -10000.0); sub_11 = None
add_12: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.add.Tensor(mul_17, mul_18); mul_17 = mul_18 = None
convert_element_type_46: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.prims.convert_element_type.default(add_12, torch.bfloat16); add_12 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:829 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_attn_weights = nn.functional.softmax(last_product, dim=-1) # [bsz, n_heads, from_block_size, n]
convert_element_type_47: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.prims.convert_element_type.default(convert_element_type_46, torch.float32); convert_element_type_46 = None
amax_4: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.amax.default(convert_element_type_47, [-1], True)
sub_12: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.sub.Tensor(convert_element_type_47, amax_4); convert_element_type_47 = amax_4 = None
exp_4: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.exp.default(sub_12); sub_12 = None
sum_5: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp_4, [-1], True)
div_6: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.div.Tensor(exp_4, sum_5); exp_4 = sum_5 = None
convert_element_type_48: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.prims.convert_element_type.default(div_6, torch.bfloat16); div_6 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
view_70: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.ops.aten.view.default(convert_element_type_48, [-1, 64, 832]); convert_element_type_48 = None
view_71: "bf16[12, 832, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(arg15_1, [12, 832, 64]); arg15_1 = None
bmm_15: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_70, view_71); view_70 = view_71 = None
view_72: "bf16[1, 12, 64, 64][49152, 4096, 64, 1]cpu" = torch.ops.aten.view.default(bmm_15, [1, 12, 64, 64]); bmm_15 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:833 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_context_layer.unsqueeze_(2)
unsqueeze_21: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(view_72, 2); view_72 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:836 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer = torch.cat(
cat_12: "bf16[1, 12, 13, 64, 64][638976, 53248, 4096, 64, 1]cpu" = torch.ops.aten.cat.default([unsqueeze_4, unsqueeze_5, view_60, unsqueeze_20, unsqueeze_21], 2); unsqueeze_4 = unsqueeze_5 = view_60 = unsqueeze_20 = unsqueeze_21 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:840 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer = context_layer.view((bsz, n_heads, from_seq_len, -1)) * from_mask
view_73: "bf16[1, 12, 832, 64][638976, 53248, 64, 1]cpu" = torch.ops.aten.view.default(cat_12, [1, 12, 832, -1]); cat_12 = None
mul_19: "f32[1, 12, 832, 64][638976, 53248, 64, 1]cpu" = torch.ops.aten.mul.Tensor(view_73, arg18_1); view_73 = arg18_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:841 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer = torch.transpose(context_layer, 1, 2)
permute_28: "f32[1, 832, 12, 64][638976, 64, 53248, 1]cpu" = torch.ops.aten.permute.default(mul_19, [0, 2, 1, 3]); mul_19 = None
return (permute_28, clone)
V0627 17:31:05.639000 139845268738432 torch/_inductor/compile_fx.py:754] {"inductor_post_grad_graph": {}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0, "has_payload": "ee4f5da4b7396f62d53589c7ddc358c5"}
class <lambda>(torch.nn.Module):
def forward(self, arg0_1: "i32[11, 3][3, 1]cpu", arg1_1: "i32[11, 3][3, 1]cpu", arg2_1: "i32[11, 3][3, 1]cpu", arg3_1: "i32[11, 3][3, 1]cpu", arg4_1: "i32[11, 3][3, 1]cpu", arg5_1: "i32[11, 3][3, 1]cpu", arg6_1: "i32[11, 3][3, 1]cpu", arg7_1: "i32[11, 3][3, 1]cpu", arg8_1: "i32[11, 3][3, 1]cpu", arg9_1: "i32[11, 3][3, 1]cpu", arg10_1: "i32[11, 3][3, 1]cpu", arg11_1: "i32[11, 3][3, 1]cpu", arg12_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", arg13_1: "f32[1, 13, 64][832, 64, 1]cpu", arg14_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", arg15_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", arg16_1: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu", arg17_1: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu", arg18_1: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu"):
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:602 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_query_matrix = query_layer.view(bsz, n_heads, from_seq_len // from_block_size, from_block_size, -1)
view_4: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.reshape.default(arg12_1, [1, 12, 13, 64, -1]); arg12_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:621 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, 0], key_layer, ndim=4)
select_2: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_4, 2, 0)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
view_15: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(select_2, [12, 64, 64]); select_2 = None
view_16: "bf16[12, 832, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(arg14_1, [12, 832, 64])
permute_2: "bf16[12, 64, 832][64, 1, 768]cpu" = torch.ops.aten.permute.default(view_16, [0, 2, 1]); view_16 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
bmm: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.ops.aten.bmm.default(view_15, permute_2); view_15 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
view_17: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.reshape.default(bmm, [1, 12, 64, 832]); bmm = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:623 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product = first_product * rsqrt_d
mul_3: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.mul.Tensor(view_17, 0.125); view_17 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:624 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product += (1.0 - to_mask) * attn_mask_penalty
sub: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, arg16_1)
mul_4: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.mul.Tensor(sub, -10000.0); sub = None
add_2: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.add.Tensor(mul_3, mul_4); mul_3 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:625 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_attn_weights = nn.functional.softmax(
amax: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.amax.default(add_2, [-1], True)
sub_1: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.sub.Tensor(add_2, amax); add_2 = amax = None
exp: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.exp.default(sub_1); sub_1 = None
sum_1: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp, [-1], True)
div_2: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.div.Tensor(exp, sum_1); exp = sum_1 = None
convert_element_type_5: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.prims.convert_element_type.default(div_2, torch.bfloat16); div_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
view_18: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.ops.aten.reshape.default(convert_element_type_5, [-1, 64, 832]); convert_element_type_5 = None
view_19: "bf16[12, 832, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(arg15_1, [12, 832, 64])
bmm_1: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_18, view_19); view_18 = None
view_20: "bf16[1, 12, 64, 64][49152, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_1, [1, 12, 64, 64]); bmm_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:631 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_context_layer.unsqueeze_(2)
unsqueeze_4: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(view_20, 2); view_20 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:661 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, 1], second_key_mat, ndim=4)
select_13: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_4, 2, 1)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
view_21: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(select_13, [12, 64, 64]); select_13 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:603 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix = key_layer.view(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1)
view_5: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.reshape.default(arg14_1, [1, 12, 13, 64, -1]); arg14_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:641 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 0],
select_3: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_5, 2, 0)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:642 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 1],
select_4: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_5, 2, 1)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:643 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 2],
select_5: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_5, 2, 2)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:644 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -1],
select_6: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_5, 2, -1)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:976 in torch_gather_b2, code: flattened_params = params.reshape(-1, params.shape[-2], params.shape[-1])
clone_2: "bf16[1, 12, 13, 64, 64][638976, 53248, 4096, 64, 1]cpu" = torch.ops.aten.clone.default(view_5, memory_format = torch.contiguous_format)
view_8: "bf16[156, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.reshape.default(clone_2, [156, 64, 64]); clone_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:593 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = np.stack(rand_attn, axis=0)
cat: "i32[132, 3][3, 1]cpu" = torch.ops.aten.cat.default([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1]); arg0_1 = arg1_1 = arg2_1 = arg3_1 = arg4_1 = arg5_1 = arg6_1 = arg7_1 = arg8_1 = arg9_1 = arg10_1 = arg11_1 = None
view: "i32[12, 11, 3][33, 3, 1]cpu" = torch.ops.aten.reshape.default(cat, [12, 11, 3]); cat = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:594 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = torch.tensor(rand_attn, device=query_layer.device, dtype=torch.long)
convert_element_type: "i64[12, 11, 3][33, 3, 1]cpu" = torch.ops.prims.convert_element_type.default(view, torch.int64); view = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:595 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn.unsqueeze_(0)
unsqueeze: "i64[1, 12, 11, 3][396, 33, 3, 1]cpu" = torch.ops.aten.unsqueeze.default(convert_element_type, 0); convert_element_type = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:975 in torch_gather_b2, code: flattened_indices = indices.view(-1) + indices_shift
view_7: "i64[396][1]cpu" = torch.ops.aten.reshape.default(unsqueeze, [-1])
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:972 in torch_gather_b2, code: shift = torch.arange(indices.shape[0] * indices.shape[1] * num_indices_to_gather, device=indices.device)
iota: "i64[396][1]cpu" = torch.ops.prims.iota.default(396, start = 0, step = 1, dtype = torch.int64, device = device(type='cpu'), requires_grad = False)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:973 in torch_gather_b2, code: indices_shift = torch.div(shift, num_indices_to_gather, rounding_mode="floor") * num_indices_to_pick_from
div: "i64[396][1]cpu" = torch.ops.aten.div.Tensor_mode(iota, 33, rounding_mode = 'floor'); iota = None
mul_1: "i64[396][1]cpu" = torch.ops.aten.mul.Tensor(div, 13); div = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:975 in torch_gather_b2, code: flattened_indices = indices.view(-1) + indices_shift
add: "i64[396][1]cpu" = torch.ops.aten.add.Tensor(view_7, mul_1); view_7 = mul_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:978 in torch_gather_b2, code: out_flattened = flattened_params.index_select(0, flattened_indices)
index_1: "bf16[396, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.index.Tensor(view_8, [add]); view_8 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:980 in torch_gather_b2, code: out = out_flattened.reshape(params.shape[:2] + (num_indices_to_gather,) + params.shape[3:])
view_9: "bf16[1, 12, 33, 64, 64][1622016, 135168, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(index_1, [1, 12, 33, 64, 64]); index_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:608 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key = gathered_key.view(
view_10: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.reshape.default(view_9, [1, 12, 11, 192, -1]); view_9 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:645 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key[:, :, 0],
select_7: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = torch.ops.aten.select.int(view_10, 2, 0)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:639 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_key_mat = torch.cat(
cat_1: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" = torch.ops.aten.cat.default([select_3, select_4, select_5, select_6, select_7], 2); select_4 = select_5 = select_7 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
view_22: "bf16[12, 448, 64][28672, 64, 1]cpu" = torch.ops.aten.reshape.default(cat_1, [-1, 448, 64]); cat_1 = None
permute_3: "bf16[12, 64, 448][28672, 1, 64]cpu" = torch.ops.aten.permute.default(view_22, [0, 2, 1]); view_22 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
bmm_2: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.ops.aten.bmm.default(view_21, permute_3); view_21 = permute_3 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
view_23: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.reshape.default(bmm_2, [1, 12, 64, 448]); bmm_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:677 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product = second_product * rsqrt_d
mul_5: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.mul.Tensor(view_23, 0.125); view_23 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:664 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, : 3 * to_block_size],
slice_30: "f32[1, 1, 1, 192][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 3, 0, 192)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:665 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, -to_block_size:],
slice_34: "f32[1, 1, 1, 64][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 3, -64, 9223372036854775807)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:666 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask.new_ones([bsz, 1, 1, n_rand_blocks * to_block_size]),
full_default: "f32[1, 1, 1, 192][192, 192, 192, 1]cpu" = torch.ops.aten.full.default([1, 1, 1, 192], 1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:662 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_seq_pad = torch.cat(
cat_3: "f32[1, 1, 1, 448][448, 448, 448, 1]cpu" = torch.ops.aten.cat.default([slice_30, slice_34, full_default], 3); slice_30 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:672 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask.new_ones([bsz, n_heads, from_block_size, 4 * to_block_size]),
full_default_1: "f32[1, 12, 64, 256][196608, 16384, 256, 1]cpu" = torch.ops.aten.full.default([1, 12, 64, 256], 1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1017 in _create_rand_mask_from_inputs, code: rand_mask = torch.einsum("blq,bhlk->bhlqk", from_blocked_mask[:, 1:-1], rand_mask)
slice_2: "f32[1, 11, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(arg13_1, 1, 1, -1)
unsqueeze_1: "f32[1, 11, 64, 1][832, 64, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_2, 3); slice_2 = None
unsqueeze_2: "f32[1, 11, 64, 1, 1][832, 64, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_1, 4); unsqueeze_1 = None
permute: "f32[1, 1, 11, 64, 1][832, 1, 64, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_2, [0, 3, 1, 2, 4]); unsqueeze_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in _create_rand_mask_from_inputs, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
select: "f32[13, 64][64, 1]cpu" = torch.ops.aten.select.int(arg13_1, 0, 0); arg13_1 = None
select_1: "i64[12, 11, 3][33, 3, 1]cpu" = torch.ops.aten.select.int(unsqueeze, 0, 0)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in <listcomp>, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
view_1: "i64[396][1]cpu" = torch.ops.aten.reshape.default(select_1, [396]); select_1 = None
index: "f32[396, 64][64, 1]cpu" = torch.ops.aten.index.Tensor(select, [view_1]); select = view_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in _create_rand_mask_from_inputs, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
view_2: "f32[1, 396, 64][25344, 64, 1]cpu" = torch.ops.aten.reshape.default(index, [1, 396, 64]); index = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1016 in _create_rand_mask_from_inputs, code: rand_mask = rand_mask.view(batch_size, num_attention_heads, num_windows, num_rand_blocks * from_block_size)
view_3: "f32[1, 12, 11, 192][25344, 2112, 192, 1]cpu" = torch.ops.aten.reshape.default(view_2, [1, 12, 11, 192]); view_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1017 in _create_rand_mask_from_inputs, code: rand_mask = torch.einsum("blq,bhlk->bhlqk", from_blocked_mask[:, 1:-1], rand_mask)
unsqueeze_3: "f32[1, 12, 11, 192, 1][25344, 2112, 192, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(view_3, 4); view_3 = None
permute_1: "f32[1, 12, 11, 1, 192][25344, 2112, 192, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_3, [0, 1, 2, 4, 3]); unsqueeze_3 = None
mul: "f32[1, 12, 11, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(permute, permute_1); permute = permute_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:673 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask[:, :, 0],
select_14: "f32[1, 12, 64, 192][1622016, 135168, 192, 1]cpu" = torch.ops.aten.select.int(mul, 2, 0)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:670 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_rand_pad = torch.cat(
cat_4: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.cat.default([full_default_1, select_14], 3); select_14 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:678 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product += (1.0 - torch.minimum(second_seq_pad, second_rand_pad)) * attn_mask_penalty
minimum: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.minimum.default(cat_3, cat_4); cat_3 = cat_4 = None
sub_2: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, minimum); minimum = None
mul_6: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.mul.Tensor(sub_2, -10000.0); sub_2 = None
add_3: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.add.Tensor(mul_5, mul_6); mul_5 = mul_6 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:679 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_attn_weights = nn.functional.softmax(
amax_1: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.amax.default(add_3, [-1], True)
sub_3: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.sub.Tensor(add_3, amax_1); add_3 = amax_1 = None
exp_1: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.exp.default(sub_3); sub_3 = None
sum_2: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp_1, [-1], True)
div_3: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.div.Tensor(exp_1, sum_2); exp_1 = sum_2 = None
convert_element_type_12: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.prims.convert_element_type.default(div_3, torch.bfloat16); div_3 = None
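        # NOTE: nn.functional.softmax decomposes into the numerically stable
        # amax -> sub -> exp -> sum -> div chain. The mask-penalty addend above is fp32,
        # so type promotion keeps the scores and the whole softmax in fp32; the result is
        # cast back to bf16 only for the bmm that consumes it. The fused cpp kernel later
        # in this log reproduces the same pattern for the last block's softmax.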
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
view_24: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.ops.aten.reshape.default(convert_element_type_12, [-1, 64, 448]); convert_element_type_12 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:604 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix = value_layer.view(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1)
view_6: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.reshape.default(arg15_1, [1, 12, 13, 64, -1]); arg15_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:651 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 0],
select_8: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_6, 2, 0)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:652 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 1],
select_9: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_6, 2, 1)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:653 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 2],
select_10: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_6, 2, 2)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:654 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -1],
select_11: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_6, 2, -1)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:976 in torch_gather_b2, code: flattened_params = params.reshape(-1, params.shape[-2], params.shape[-1])
clone_3: "bf16[1, 12, 13, 64, 64][638976, 53248, 4096, 64, 1]cpu" = torch.ops.aten.clone.default(view_6, memory_format = torch.contiguous_format)
view_12: "bf16[156, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.reshape.default(clone_3, [156, 64, 64]); clone_3 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:978 in torch_gather_b2, code: out_flattened = flattened_params.index_select(0, flattened_indices)
index_2: "bf16[396, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.index.Tensor(view_12, [add]); view_12 = add = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:980 in torch_gather_b2, code: out = out_flattened.reshape(params.shape[:2] + (num_indices_to_gather,) + params.shape[3:])
view_13: "bf16[1, 12, 33, 64, 64][1622016, 135168, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(index_2, [1, 12, 33, 64, 64]); index_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:612 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value = gathered_value.view(
view_14: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.reshape.default(view_13, [1, 12, 11, 192, -1]); view_13 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:655 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value[:, :, 0],
select_12: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = torch.ops.aten.select.int(view_14, 2, 0)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:649 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_value_mat = torch.cat(
cat_2: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" = torch.ops.aten.cat.default([select_8, select_9, select_10, select_11, select_12], 2); select_9 = select_10 = select_12 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
view_25: "bf16[12, 448, 64][28672, 64, 1]cpu" = torch.ops.aten.reshape.default(cat_2, [-1, 448, 64]); cat_2 = None
bmm_3: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_24, view_25); view_24 = view_25 = None
view_26: "bf16[1, 12, 64, 64][49152, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_3, [1, 12, 64, 64]); bmm_3 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:686 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_context_layer.unsqueeze_(2)
unsqueeze_5: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(view_26, 2); view_26 = None
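        # NOTE: the in-place `second_context_layer.unsqueeze_(2)` is functionalized into
        # an out-of-place aten.unsqueeze, keeping the traced graph mutation-free.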
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:702 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: middle_query_matrix = blocked_query_matrix[:, :, 2:-2]
slice_57: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_4, 2, 2, -2)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:717 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product = torch.einsum(
unsqueeze_6: "bf16[1, 12, 9, 64, 64, 1][638976, 64, 49152, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_57, 5)
permute_6: "bf16[1, 12, 9, 64, 1, 64][638976, 64, 49152, 768, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_6, [0, 1, 2, 3, 5, 4]); unsqueeze_6 = None
permute_8: "bf16[12, 9, 64, 64, 1, 1][64, 49152, 768, 1, 638976, 1]cpu" = torch.ops.aten.permute.default(permute_6, [1, 2, 3, 5, 0, 4]); permute_6 = None
view_33: "bf16[12, 576, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(permute_8, [12, 576, 64]); permute_8 = None
unsqueeze_7: "bf16[1, 12, 64, 64, 1][638976, 64, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(select_3, 4)
unsqueeze_8: "bf16[1, 12, 64, 64, 1, 1][638976, 64, 768, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_7, 5); unsqueeze_7 = None
permute_7: "bf16[1, 12, 1, 1, 64, 64][638976, 64, 1, 1, 768, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_8, [0, 1, 4, 5, 2, 3]); unsqueeze_8 = None
permute_9: "bf16[12, 64, 1, 64, 1, 1][64, 1, 638976, 768, 1, 1]cpu" = torch.ops.aten.permute.default(permute_7, [1, 5, 0, 4, 2, 3]); permute_7 = None
view_34: "bf16[12, 64, 64][64, 1, 768]cpu" = torch.ops.aten.reshape.default(permute_9, [12, 64, 64]); permute_9 = None
bmm_6: "bf16[12, 576, 64][36864, 64, 1]cpu" = torch.ops.aten.bmm.default(view_33, view_34); view_34 = None
view_35: "bf16[12, 9, 64, 1, 1, 64][36864, 4096, 64, 64, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_6, [12, 9, 64, 1, 1, 64]); bmm_6 = None
permute_10: "bf16[1, 12, 9, 64, 64, 1][64, 36864, 4096, 64, 1, 64]cpu" = torch.ops.aten.permute.default(view_35, [4, 0, 1, 2, 5, 3]); view_35 = None
view_36: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(permute_10, [1, 12, 9, 64, 64]); permute_10 = None
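        # NOTE: the einsum "bhlqd,bhkd->bhlqk" (middle queries x first key block) is
        # lowered to one bmm by flattening l and q: queries (1, 12, 9, 64, 64) become
        # (12, 576, 64), the key block is exposed as a (12, 64, 64) transposed [h, d, k]
        # view purely via strides, and the reshape/permute pair after the bmm unflattens
        # the (12, 576, 64) result back to (1, 12, 9, 64, 64).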
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:720 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product = first_band_product * rsqrt_d
mul_9: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.mul.Tensor(view_36, 0.125); view_36 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:730 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product += (1.0 - to_mask[:, :, :, :to_block_size].unsqueeze(3)) * attn_mask_penalty
slice_68: "f32[1, 1, 1, 64][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 3, 0, 64)
unsqueeze_12: "f32[1, 1, 1, 1, 64][832, 832, 832, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_68, 3)
sub_5: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, unsqueeze_12); unsqueeze_12 = None
mul_12: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = torch.ops.aten.mul.Tensor(sub_5, -10000.0); sub_5 = None
add_5: "f32[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(mul_9, mul_12); mul_9 = mul_12 = None
convert_element_type_24: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.prims.convert_element_type.default(add_5, torch.bfloat16); add_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
clone_4: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.clone.default(slice_57, memory_format = torch.contiguous_format); slice_57 = None
view_27: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.reshape.default(clone_4, [108, 64, 64]); clone_4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:696 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: [blocked_key_matrix[:, :, 1:-3], blocked_key_matrix[:, :, 2:-2], blocked_key_matrix[:, :, 3:-1]], dim=3
slice_39: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 2, 1, -3)
slice_42: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 2, 2, -2)
slice_45: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 2, 3, -1)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:695 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: exp_blocked_key_matrix = torch.cat(
cat_5: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" = torch.ops.aten.cat.default([slice_39, slice_42, slice_45], 3); slice_39 = slice_42 = slice_45 = None
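        # NOTE: the sliding-window ("inner band") keys are built from three overlapping
        # block slices shifted by one block each (1:-3, 2:-2, 3:-1), so every middle
        # query block attends to its left, own, and right key blocks -- 3 * 64 = 192
        # keys per query block -- without any explicit gather.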
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
view_28: "bf16[108, 192, 64][12288, 64, 1]cpu" = torch.ops.aten.reshape.default(cat_5, [-1, 192, 64]); cat_5 = None
permute_4: "bf16[108, 64, 192][12288, 1, 64]cpu" = torch.ops.aten.permute.default(view_28, [0, 2, 1]); view_28 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
bmm_4: "bf16[108, 64, 192][12288, 192, 1]cpu" = torch.ops.aten.bmm.default(view_27, permute_4); permute_4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
view_29: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.reshape.default(bmm_4, [1, 12, 9, 64, 192]); bmm_4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:708 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: inner_band_product = inner_band_product * rsqrt_d
mul_7: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(view_29, 0.125); view_29 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:729 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: inner_band_product += (1.0 - band_mask) * attn_mask_penalty
sub_4: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, arg17_1); arg17_1 = None
mul_11: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(sub_4, -10000.0); sub_4 = None
add_4: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.add.Tensor(mul_7, mul_11); mul_7 = mul_11 = None
convert_element_type_23: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.prims.convert_element_type.default(add_4, torch.bfloat16); add_4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:712 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product = self.torch_bmm_nd_transpose(middle_query_matrix, gathered_key[:, :, 1:-1], ndim=5)
slice_60: "bf16[1, 12, 9, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view_10, 2, 1, -1)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
clone_6: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" = torch.ops.aten.clone.default(slice_60, memory_format = torch.contiguous_format); slice_60 = None
view_31: "bf16[108, 192, 64][12288, 64, 1]cpu" = torch.ops.aten.reshape.default(clone_6, [108, 192, 64]); clone_6 = None
permute_5: "bf16[108, 64, 192][12288, 1, 64]cpu" = torch.ops.aten.permute.default(view_31, [0, 2, 1]); view_31 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
bmm_5: "bf16[108, 64, 192][12288, 192, 1]cpu" = torch.ops.aten.bmm.default(view_27, permute_5); view_27 = permute_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
view_32: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.reshape.default(bmm_5, [1, 12, 9, 64, 192]); bmm_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:714 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product = rand_band_product * rsqrt_d
mul_8: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(view_32, 0.125); view_32 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:732 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product += (1.0 - rand_mask[:, :, 1:-1]) * attn_mask_penalty
slice_75: "f32[1, 12, 9, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.ops.aten.slice.Tensor(mul, 2, 1, -1)
sub_7: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, slice_75); slice_75 = None
mul_14: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(sub_7, -10000.0); sub_7 = None
add_7: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.add.Tensor(mul_8, mul_14); mul_8 = mul_14 = None
convert_element_type_26: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.prims.convert_element_type.default(add_7, torch.bfloat16); add_7 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:723 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product = torch.einsum(
unsqueeze_10: "bf16[1, 12, 64, 64, 1][638976, 64, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(select_6, 4)
unsqueeze_11: "bf16[1, 12, 64, 64, 1, 1][638976, 64, 768, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_10, 5); unsqueeze_10 = None
permute_12: "bf16[1, 12, 1, 1, 64, 64][638976, 64, 1, 1, 768, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_11, [0, 1, 4, 5, 2, 3]); unsqueeze_11 = None
permute_14: "bf16[12, 64, 1, 64, 1, 1][64, 1, 638976, 768, 1, 1]cpu" = torch.ops.aten.permute.default(permute_12, [1, 5, 0, 4, 2, 3]); permute_12 = None
view_38: "bf16[12, 64, 64][64, 1, 768]cpu" = torch.ops.aten.reshape.default(permute_14, [12, 64, 64]); permute_14 = None
bmm_7: "bf16[12, 576, 64][36864, 64, 1]cpu" = torch.ops.aten.bmm.default(view_33, view_38); view_33 = view_38 = None
view_39: "bf16[12, 9, 64, 1, 1, 64][36864, 4096, 64, 64, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_7, [12, 9, 64, 1, 1, 64]); bmm_7 = None
permute_15: "bf16[1, 12, 9, 64, 64, 1][64, 36864, 4096, 64, 1, 64]cpu" = torch.ops.aten.permute.default(view_39, [4, 0, 1, 2, 5, 3]); view_39 = None
view_40: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(permute_15, [1, 12, 9, 64, 64]); permute_15 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:726 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product = last_band_product * rsqrt_d
mul_10: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.mul.Tensor(view_40, 0.125); view_40 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:731 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product += (1.0 - to_mask[:, :, :, -to_block_size:].unsqueeze(3)) * attn_mask_penalty
unsqueeze_13: "f32[1, 1, 1, 1, 64][832, 832, 832, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_34, 3); slice_34 = None
sub_6: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, unsqueeze_13); unsqueeze_13 = None
mul_13: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = torch.ops.aten.mul.Tensor(sub_6, -10000.0); sub_6 = None
add_6: "f32[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(mul_10, mul_13); mul_10 = mul_13 = None
convert_element_type_25: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.prims.convert_element_type.default(add_6, torch.bfloat16); add_6 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:735 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: band_product = torch.cat(
cat_7: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.cat.default([convert_element_type_24, convert_element_type_23, convert_element_type_26, convert_element_type_25], -1); convert_element_type_24 = convert_element_type_23 = convert_element_type_26 = convert_element_type_25 = None
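        # NOTE: band_product stitches the four score pieces along the key axis:
        # 64 (global first block) + 192 (sliding window) + 192 (random blocks) +
        # 64 (global last block) = 512 scores per middle-block query, which the
        # 512-wide fp32 softmax below normalizes over before the pieces are sliced
        # back apart.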
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:740 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights = nn.functional.softmax(
convert_element_type_27: "f32[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.prims.convert_element_type.default(cat_7, torch.float32); cat_7 = None
amax_2: "f32[1, 12, 9, 64, 1][6912, 576, 64, 1, 1]cpu" = torch.ops.aten.amax.default(convert_element_type_27, [-1], True)
sub_8: "f32[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.sub.Tensor(convert_element_type_27, amax_2); convert_element_type_27 = amax_2 = None
exp_2: "f32[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.exp.default(sub_8); sub_8 = None
sum_3: "f32[1, 12, 9, 64, 1][6912, 576, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp_2, [-1], True)
div_4: "f32[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.div.Tensor(exp_2, sum_3); exp_2 = sum_3 = None
convert_element_type_28: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.prims.convert_element_type.default(div_4, torch.bfloat16); div_4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:747 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights[:, :, :, :, to_block_size : 4 * to_block_size], exp_blocked_value_matrix, ndim=5
slice_80: "bf16[1, 12, 9, 64, 192][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(convert_element_type_28, 4, 64, 256)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
view_41: "bf16[108, 64, 192][32768, 512, 1]cpu" = torch.ops.aten.reshape.default(slice_80, [108, 64, 192]); slice_80 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:699 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: [blocked_value_matrix[:, :, 1:-3], blocked_value_matrix[:, :, 2:-2], blocked_value_matrix[:, :, 3:-1]],
slice_48: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 2, 1, -3)
slice_51: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 2, 2, -2)
slice_54: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 2, 3, -1)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:698 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: exp_blocked_value_matrix = torch.cat(
cat_6: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" = torch.ops.aten.cat.default([slice_48, slice_51, slice_54], 3); slice_48 = slice_51 = slice_54 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
view_42: "bf16[108, 192, 64][12288, 64, 1]cpu" = torch.ops.aten.reshape.default(cat_6, [-1, 192, 64]); cat_6 = None
bmm_8: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_41, view_42); view_41 = view_42 = None
view_43: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_8, [1, 12, 9, 64, 64]); bmm_8 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:754 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights[:, :, :, :, 4 * to_block_size : -to_block_size], gathered_value[:, :, 1:-1], ndim=5
slice_85: "bf16[1, 12, 9, 64, 192][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(convert_element_type_28, 4, 256, -64)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
view_44: "bf16[108, 64, 192][32768, 512, 1]cpu" = torch.ops.aten.reshape.default(slice_85, [108, 64, 192]); slice_85 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:754 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights[:, :, :, :, 4 * to_block_size : -to_block_size], gathered_value[:, :, 1:-1], ndim=5
slice_88: "bf16[1, 12, 9, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view_14, 2, 1, -1)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
clone_7: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" = torch.ops.aten.clone.default(slice_88, memory_format = torch.contiguous_format); slice_88 = None
view_45: "bf16[108, 192, 64][12288, 64, 1]cpu" = torch.ops.aten.reshape.default(clone_7, [108, 192, 64]); clone_7 = None
bmm_9: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_44, view_45); view_44 = view_45 = None
view_46: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_9, [1, 12, 9, 64, 64]); bmm_9 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:753 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += self.torch_bmm_nd(
add_8: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(view_43, view_46); view_43 = view_46 = None
view_47: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.reshape.default(add_8, [108, 64, 64]); add_8 = None
view_48: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(view_47, [1, 12, 9, 64, 64]); view_47 = None
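        # NOTE: the in-place `context_layer += ...` is functionalized into an
        # out-of-place aten.add; view_47/view_48 appear to re-materialize the
        # (108, 64, 64) bmm base and re-take the 5-d view, a reshape round-trip with
        # no data movement that inductor can later eliminate.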
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:760 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, :to_block_size], blocked_value_matrix[:, :, 0]
slice_93: "bf16[1, 12, 9, 64, 64][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(convert_element_type_28, 4, 0, 64)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:759 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += torch.einsum(
unsqueeze_14: "bf16[1, 12, 9, 64, 64, 1][3538944, 294912, 32768, 512, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_93, 5); slice_93 = None
permute_16: "bf16[1, 12, 9, 64, 1, 64][3538944, 294912, 32768, 512, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_14, [0, 1, 2, 3, 5, 4]); unsqueeze_14 = None
permute_18: "bf16[12, 9, 64, 64, 1, 1][294912, 32768, 512, 1, 3538944, 1]cpu" = torch.ops.aten.permute.default(permute_16, [1, 2, 3, 5, 0, 4]); permute_16 = None
view_49: "bf16[12, 576, 64][294912, 512, 1]cpu" = torch.ops.aten.reshape.default(permute_18, [12, 576, 64]); permute_18 = None
unsqueeze_15: "bf16[1, 12, 64, 64, 1][638976, 64, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(select_8, 4)
unsqueeze_16: "bf16[1, 12, 64, 64, 1, 1][638976, 64, 768, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_15, 5); unsqueeze_15 = None
permute_17: "bf16[1, 12, 1, 1, 64, 64][638976, 64, 1, 1, 1, 768]cpu" = torch.ops.aten.permute.default(unsqueeze_16, [0, 1, 4, 5, 3, 2]); unsqueeze_16 = None
permute_19: "bf16[12, 64, 1, 64, 1, 1][64, 768, 638976, 1, 1, 1]cpu" = torch.ops.aten.permute.default(permute_17, [1, 5, 0, 4, 2, 3]); permute_17 = None
view_50: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(permute_19, [12, 64, 64]); permute_19 = None
bmm_10: "bf16[12, 576, 64][36864, 64, 1]cpu" = torch.ops.aten.bmm.default(view_49, view_50); view_49 = view_50 = None
view_51: "bf16[12, 9, 64, 1, 1, 64][36864, 4096, 64, 64, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_10, [12, 9, 64, 1, 1, 64]); bmm_10 = None
permute_20: "bf16[1, 12, 9, 64, 64, 1][64, 36864, 4096, 64, 1, 64]cpu" = torch.ops.aten.permute.default(view_51, [4, 0, 1, 2, 5, 3]); view_51 = None
view_52: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(permute_20, [1, 12, 9, 64, 64]); permute_20 = None
add_9: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(view_48, view_52); view_48 = view_52 = None
view_53: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.reshape.default(add_9, [108, 64, 64]); add_9 = None
view_54: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(view_53, [1, 12, 9, 64, 64]); view_53 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:763 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, -to_block_size:], blocked_value_matrix[:, :, -1]
slice_100: "bf16[1, 12, 9, 64, 64][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(convert_element_type_28, 4, -64, 9223372036854775807); convert_element_type_28 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:762 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += torch.einsum(
unsqueeze_17: "bf16[1, 12, 9, 64, 64, 1][3538944, 294912, 32768, 512, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_100, 5); slice_100 = None
permute_21: "bf16[1, 12, 9, 64, 1, 64][3538944, 294912, 32768, 512, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_17, [0, 1, 2, 3, 5, 4]); unsqueeze_17 = None
permute_23: "bf16[12, 9, 64, 64, 1, 1][294912, 32768, 512, 1, 3538944, 1]cpu" = torch.ops.aten.permute.default(permute_21, [1, 2, 3, 5, 0, 4]); permute_21 = None
view_55: "bf16[12, 576, 64][294912, 512, 1]cpu" = torch.ops.aten.reshape.default(permute_23, [12, 576, 64]); permute_23 = None
unsqueeze_18: "bf16[1, 12, 64, 64, 1][638976, 64, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(select_11, 4)
unsqueeze_19: "bf16[1, 12, 64, 64, 1, 1][638976, 64, 768, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_18, 5); unsqueeze_18 = None
permute_22: "bf16[1, 12, 1, 1, 64, 64][638976, 64, 1, 1, 1, 768]cpu" = torch.ops.aten.permute.default(unsqueeze_19, [0, 1, 4, 5, 3, 2]); unsqueeze_19 = None
permute_24: "bf16[12, 64, 1, 64, 1, 1][64, 768, 638976, 1, 1, 1]cpu" = torch.ops.aten.permute.default(permute_22, [1, 5, 0, 4, 2, 3]); permute_22 = None
view_56: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(permute_24, [12, 64, 64]); permute_24 = None
bmm_11: "bf16[12, 576, 64][36864, 64, 1]cpu" = torch.ops.aten.bmm.default(view_55, view_56); view_55 = view_56 = None
view_57: "bf16[12, 9, 64, 1, 1, 64][36864, 4096, 64, 64, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_11, [12, 9, 64, 1, 1, 64]); bmm_11 = None
permute_25: "bf16[1, 12, 9, 64, 64, 1][64, 36864, 4096, 64, 1, 64]cpu" = torch.ops.aten.permute.default(view_57, [4, 0, 1, 2, 5, 3]); view_57 = None
view_58: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(permute_25, [1, 12, 9, 64, 64]); permute_25 = None
add_10: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(view_54, view_58); view_54 = view_58 = None
view_59: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.reshape.default(add_10, [108, 64, 64]); add_10 = None
view_60: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(view_59, [1, 12, 9, 64, 64]); view_59 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:795 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -2], second_last_key_mat, ndim=4)
select_29: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_4, 2, -2)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
view_61: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(select_29, [12, 64, 64]); select_29 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:776 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -3],
select_20: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_5, 2, -3)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:777 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -2],
select_21: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_5, 2, -2); view_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:779 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key[:, :, -1],
select_23: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = torch.ops.aten.select.int(view_10, 2, -1); view_10 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:773 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_key_mat = torch.cat(
cat_8: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" = torch.ops.aten.cat.default([select_3, select_20, select_21, select_6, select_23], 2); select_3 = select_20 = select_21 = select_6 = select_23 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
view_62: "bf16[12, 448, 64][28672, 64, 1]cpu" = torch.ops.aten.reshape.default(cat_8, [-1, 448, 64]); cat_8 = None
permute_26: "bf16[12, 64, 448][28672, 1, 64]cpu" = torch.ops.aten.permute.default(view_62, [0, 2, 1]); view_62 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
bmm_12: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.ops.aten.bmm.default(view_61, permute_26); view_61 = permute_26 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
view_63: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.reshape.default(bmm_12, [1, 12, 64, 448]); bmm_12 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:811 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product = second_last_product * rsqrt_d
mul_15: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.mul.Tensor(view_63, 0.125); view_63 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:799 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, -3 * to_block_size :],
slice_132: "f32[1, 1, 1, 192][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 3, -192, 9223372036854775807); arg16_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:796 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_seq_pad = torch.cat(
cat_10: "f32[1, 1, 1, 448][448, 448, 448, 1]cpu" = torch.ops.aten.cat.default([slice_68, slice_132, full_default], 3); slice_68 = slice_132 = full_default = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:807 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask[:, :, -1],
select_30: "f32[1, 12, 64, 192][1622016, 135168, 192, 1]cpu" = torch.ops.aten.select.int(mul, 2, -1); mul = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:804 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_rand_pad = torch.cat(
cat_11: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.cat.default([full_default_1, select_30], 3); full_default_1 = select_30 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:812 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product += (1.0 - torch.minimum(second_last_seq_pad, second_last_rand_pad)) * attn_mask_penalty
minimum_1: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.minimum.default(cat_10, cat_11); cat_10 = cat_11 = None
sub_9: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, minimum_1); minimum_1 = None
mul_16: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.mul.Tensor(sub_9, -10000.0); sub_9 = None
add_11: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.add.Tensor(mul_15, mul_16); mul_15 = mul_16 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:813 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_attn_weights = nn.functional.softmax(
amax_3: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.amax.default(add_11, [-1], True)
sub_10: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.sub.Tensor(add_11, amax_3); add_11 = amax_3 = None
exp_3: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.exp.default(sub_10); sub_10 = None
sum_4: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp_3, [-1], True)
div_5: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.div.Tensor(exp_3, sum_4); exp_3 = sum_4 = None
convert_element_type_41: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.prims.convert_element_type.default(div_5, torch.bfloat16); div_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
view_64: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.ops.aten.reshape.default(convert_element_type_41, [-1, 64, 448]); convert_element_type_41 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:786 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -3],
select_25: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_6, 2, -3)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:787 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -2],
select_26: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_6, 2, -2); view_6 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:789 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value[:, :, -1],
select_28: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = torch.ops.aten.select.int(view_14, 2, -1); view_14 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:783 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_value_mat = torch.cat(
cat_9: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" = torch.ops.aten.cat.default([select_8, select_25, select_26, select_11, select_28], 2); select_8 = select_25 = select_26 = select_11 = select_28 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
view_65: "bf16[12, 448, 64][28672, 64, 1]cpu" = torch.ops.aten.reshape.default(cat_9, [-1, 448, 64]); cat_9 = None
bmm_13: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_64, view_65); view_64 = view_65 = None
view_66: "bf16[1, 12, 64, 64][49152, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_13, [1, 12, 64, 64]); bmm_13 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:819 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_context_layer.unsqueeze_(2)
unsqueeze_20: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(view_66, 2); view_66 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:826 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -1], key_layer, ndim=4)
select_31: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_4, 2, -1); view_4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
view_67: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(select_31, [12, 64, 64]); select_31 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
bmm_14: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.ops.aten.bmm.default(view_67, permute_2); view_67 = permute_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
view_69: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.reshape.default(bmm_14, [1, 12, 64, 832]); bmm_14 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:827 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product = last_product * rsqrt_d
mul_17: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.mul.Tensor(view_69, 0.125); view_69 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:828 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product += (1.0 - to_mask) * attn_mask_penalty
add_12: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.add.Tensor(mul_17, mul_4); mul_17 = mul_4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:829 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_attn_weights = nn.functional.softmax(last_product, dim=-1) # [bsz, n_heads, from_block_size, n]
amax_4: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.amax.default(add_12, [-1], True)
sub_12: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.sub.Tensor(add_12, amax_4); add_12 = amax_4 = None
exp_4: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.exp.default(sub_12); sub_12 = None
sum_5: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp_4, [-1], True)
div_6: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.div.Tensor(exp_4, sum_5); exp_4 = sum_5 = None
convert_element_type_48: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.prims.convert_element_type.default(div_6, torch.bfloat16); div_6 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
view_70: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.ops.aten.reshape.default(convert_element_type_48, [-1, 64, 832]); convert_element_type_48 = None
bmm_15: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_70, view_19); view_70 = view_19 = None
view_72: "bf16[1, 12, 64, 64][49152, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_15, [1, 12, 64, 64]); bmm_15 = None
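        # NOTE: like the first block, the last query block attends densely: its 64
        # queries score against all 832 keys. Judging by their shapes, `permute_2`
        # ((12, 64, 832)) and `view_19` ((12, 832, 64)), both defined above this
        # excerpt, are the full key layer transposed and the full value layer
        # flattened for bmm.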
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:833 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_context_layer.unsqueeze_(2)
unsqueeze_21: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(view_72, 2); view_72 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:836 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer = torch.cat(
cat_12: "bf16[1, 12, 13, 64, 64][638976, 53248, 4096, 64, 1]cpu" = torch.ops.aten.cat.default([unsqueeze_4, unsqueeze_5, view_60, unsqueeze_20, unsqueeze_21], 2); unsqueeze_4 = unsqueeze_5 = view_60 = unsqueeze_20 = unsqueeze_21 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:840 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer = context_layer.view((bsz, n_heads, from_seq_len, -1)) * from_mask
view_73: "bf16[1, 12, 832, 64][638976, 53248, 64, 1]cpu" = torch.ops.aten.reshape.default(cat_12, [1, 12, 832, -1]); cat_12 = None
mul_19: "f32[1, 12, 832, 64][638976, 53248, 64, 1]cpu" = torch.ops.aten.mul.Tensor(view_73, arg18_1); view_73 = arg18_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:841 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer = torch.transpose(context_layer, 1, 2)
permute_28: "f32[1, 832, 12, 64][638976, 64, 53248, 1]cpu" = torch.ops.aten.permute.default(mul_19, [0, 2, 1, 3]); mul_19 = None
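        # NOTE: the five context pieces (first, second, 9 middle, second-last, last:
        # 1 + 1 + 9 + 1 + 1 = 13 blocks of 64 rows = 832) are concatenated, flattened
        # to (1, 12, 832, 64), multiplied by from_mask (bf16 * fp32 promotes the output
        # to fp32), and transposed to (batch, seq_len, heads, head_dim) for the caller.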
return (permute_28, unsqueeze)
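# Reading of the graph tail above: last_product is widened to fp32, scaled by
# 1/sqrt(64) = 0.125, biased with (1 - to_mask) * -10000, and softmaxed along
# the last dim (amax-subtract / exp / sum / div) before being cast back to
# bf16 and bmm'ed against the value blocks; the five context pieces are then
# concatenated, multiplied by from_mask, and transposed to [1, 832, 12, 64].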
V0627 17:31:09.541000 139845268738432 torch/_inductor/graph.py:1693] {"inductor_output_code": {"filename": "/tmp/torchinductor_leslie/6i/c6icdm2jkh5xkxrgpyz2vtbd5oehca45dznneh7n63f3sirkkptn.py"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0, "has_payload": "47217ba55691917867319806954aafb8"}
# AOT ID: ['8_inference']
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
from torch._inductor.codegen.memory_planning import _align as align
from torch import device, empty_strided
from torch._inductor.async_compile import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
from torch._inductor.codegen.multi_kernel import MultiKernelCall
aten = torch.ops.aten
inductor_ops = torch.ops.inductor
_quantized = torch.ops._quantized
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
alloc_from_pool = torch.ops.inductor._alloc_from_pool
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
async_compile = AsyncCompile()
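# The kernels below are Inductor's generated CPU code for this subgraph: each
# async_compile.cpp_pybinding(...) call compiles one fused C++ kernel that uses
# OpenMP parallel loops over at::vec::Vectorized lanes, with bf16 operands
# widened to fp32 for the arithmetic.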
cpp_fused__softmax_add_mul_rsub_0 = async_compile.cpp_pybinding(['const bfloat16*', 'const float*', 'float*', 'float*', 'float*', 'bfloat16*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const bfloat16* in_ptr0,
const float* in_ptr1,
float* out_ptr0,
float* out_ptr1,
float* out_ptr2,
bfloat16* out_ptr3)
{
#pragma omp parallel num_threads(56)
{
int tid = omp_get_thread_num();
{
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(768L); x0+=static_cast<long>(1L))
{
{
float tmp_acc0 = -std::numeric_limits<float>::infinity();
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(-std::numeric_limits<float>::infinity());
for(long x1=static_cast<long>(0L); x1<static_cast<long>(832L); x1+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x1 + (832L*x0)), 16);
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(x1), 16);
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = static_cast<float>(0.125);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 * tmp3;
auto tmp5 = (tmp4);
auto tmp7 = static_cast<float>(1.0);
auto tmp8 = at::vec::Vectorized<float>(tmp7);
auto tmp9 = tmp8 - tmp6;
auto tmp10 = static_cast<float>(-10000.0);
auto tmp11 = at::vec::Vectorized<float>(tmp10);
auto tmp12 = tmp9 * tmp11;
auto tmp13 = tmp5 + tmp12;
tmp_acc0_vec = at::vec::maximum(tmp_acc0_vec, tmp13);
}
tmp_acc0 = max_propagate_nan(tmp_acc0, at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return at::vec::maximum(x, y); }, tmp_acc0_vec));
out_ptr0[static_cast<long>(x0)] = static_cast<float>(tmp_acc0);
}
{
float tmp_acc0 = 0;
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
for(long x1=static_cast<long>(0L); x1<static_cast<long>(832L); x1+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x1 + (832L*x0)), 16);
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(x1), 16);
auto tmp14 = out_ptr0[static_cast<long>(x0)];
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = static_cast<float>(0.125);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 * tmp3;
auto tmp5 = (tmp4);
auto tmp7 = static_cast<float>(1.0);
auto tmp8 = at::vec::Vectorized<float>(tmp7);
auto tmp9 = tmp8 - tmp6;
auto tmp10 = static_cast<float>(-10000.0);
auto tmp11 = at::vec::Vectorized<float>(tmp10);
auto tmp12 = tmp9 * tmp11;
auto tmp13 = tmp5 + tmp12;
auto tmp15 = at::vec::Vectorized<float>(tmp14);
auto tmp16 = tmp13 - tmp15;
auto tmp17 = tmp16.exp();
tmp17.store(out_ptr1 + static_cast<long>(x1 + (832L*x0)));
tmp_acc0_vec = tmp_acc0_vec + tmp17;
}
tmp_acc0 = tmp_acc0 + at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return x + y; }, tmp_acc0_vec);
out_ptr2[static_cast<long>(x0)] = static_cast<float>(tmp_acc0);
}
for(long x1=static_cast<long>(0L); x1<static_cast<long>(832L); x1+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr1 + static_cast<long>(x1 + (832L*x0)), 16);
auto tmp1 = out_ptr2[static_cast<long>(x0)];
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 / tmp2;
auto tmp4 = at::vec::convert<bfloat16>(tmp3);
tmp4.store(out_ptr3 + static_cast<long>(x1 + (832L*x0)), 16);
}
}
}
}
}
''')
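# cpp_fused__softmax_add_mul_rsub_0: row-wise masked softmax over 832 columns
# for 768 rows (12 heads x 64 queries). Each bf16 score is widened to fp32,
# scaled by 0.125, offset by (1 - mask) * -10000, then normalized in three
# passes (row max, exp-and-sum, divide) and stored back as bf16.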
cpp_fused__to_copy_cat_stack_1 = async_compile.cpp_pybinding(['const int32_t*', 'const int32_t*', 'const int32_t*', 'const int32_t*', 'const int32_t*', 'const int32_t*', 'const int32_t*', 'const int32_t*', 'const int32_t*', 'const int32_t*', 'const int32_t*', 'const int32_t*', 'const int32_t*', 'const bfloat16*', 'int32_t*', 'int32_t*', 'int32_t*', 'int32_t*', 'int32_t*', 'int32_t*', 'int32_t*', 'int32_t*', 'int32_t*', 'int32_t*', 'int32_t*', 'int32_t*', 'int64_t*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const int32_t* in_ptr0,
const int32_t* in_ptr1,
const int32_t* in_ptr2,
const int32_t* in_ptr3,
const int32_t* in_ptr4,
const int32_t* in_ptr5,
const int32_t* in_ptr6,
const int32_t* in_ptr7,
const int32_t* in_ptr8,
const int32_t* in_ptr9,
const int32_t* in_ptr10,
const int32_t* in_ptr11,
const int32_t* in_ptr12,
const bfloat16* in_ptr13,
int32_t* out_ptr0,
int32_t* out_ptr1,
int32_t* out_ptr2,
int32_t* out_ptr3,
int32_t* out_ptr4,
int32_t* out_ptr5,
int32_t* out_ptr6,
int32_t* out_ptr7,
int32_t* out_ptr8,
int32_t* out_ptr9,
int32_t* out_ptr10,
int32_t* out_ptr11,
int64_t* out_ptr12,
bfloat16* out_ptr13,
bfloat16* out_ptr14,
bfloat16* out_ptr15,
bfloat16* out_ptr16,
bfloat16* out_ptr17,
bfloat16* out_ptr18,
bfloat16* out_ptr19,
bfloat16* out_ptr20)
{
{
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr0 + static_cast<long>(x0), 16);
tmp0.store(out_ptr0 + static_cast<long>(x0), 16);
}
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L))
{
auto tmp0 = in_ptr0[static_cast<long>(x0)];
out_ptr0[static_cast<long>(x0)] = tmp0;
}
}
{
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr1 + static_cast<long>(x0), 16);
tmp0.store(out_ptr1 + static_cast<long>(x0), 16);
}
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L))
{
auto tmp0 = in_ptr1[static_cast<long>(x0)];
out_ptr1[static_cast<long>(x0)] = tmp0;
}
}
{
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr2 + static_cast<long>(x0), 16);
tmp0.store(out_ptr2 + static_cast<long>(x0), 16);
}
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L))
{
auto tmp0 = in_ptr2[static_cast<long>(x0)];
out_ptr2[static_cast<long>(x0)] = tmp0;
}
}
{
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr3 + static_cast<long>(x0), 16);
tmp0.store(out_ptr3 + static_cast<long>(x0), 16);
}
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L))
{
auto tmp0 = in_ptr3[static_cast<long>(x0)];
out_ptr3[static_cast<long>(x0)] = tmp0;
}
}
{
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr4 + static_cast<long>(x0), 16);
tmp0.store(out_ptr4 + static_cast<long>(x0), 16);
}
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L))
{
auto tmp0 = in_ptr4[static_cast<long>(x0)];
out_ptr4[static_cast<long>(x0)] = tmp0;
}
}
{
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr5 + static_cast<long>(x0), 16);
tmp0.store(out_ptr5 + static_cast<long>(x0), 16);
}
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L))
{
auto tmp0 = in_ptr5[static_cast<long>(x0)];
out_ptr5[static_cast<long>(x0)] = tmp0;
}
}
{
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr6 + static_cast<long>(x0), 16);
tmp0.store(out_ptr6 + static_cast<long>(x0), 16);
}
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L))
{
auto tmp0 = in_ptr6[static_cast<long>(x0)];
out_ptr6[static_cast<long>(x0)] = tmp0;
}
}
{
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr7 + static_cast<long>(x0), 16);
tmp0.store(out_ptr7 + static_cast<long>(x0), 16);
}
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L))
{
auto tmp0 = in_ptr7[static_cast<long>(x0)];
out_ptr7[static_cast<long>(x0)] = tmp0;
}
}
{
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr8 + static_cast<long>(x0), 16);
tmp0.store(out_ptr8 + static_cast<long>(x0), 16);
}
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L))
{
auto tmp0 = in_ptr8[static_cast<long>(x0)];
out_ptr8[static_cast<long>(x0)] = tmp0;
}
}
{
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr9 + static_cast<long>(x0), 16);
tmp0.store(out_ptr9 + static_cast<long>(x0), 16);
}
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L))
{
auto tmp0 = in_ptr9[static_cast<long>(x0)];
out_ptr9[static_cast<long>(x0)] = tmp0;
}
}
{
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr10 + static_cast<long>(x0), 16);
tmp0.store(out_ptr10 + static_cast<long>(x0), 16);
}
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L))
{
auto tmp0 = in_ptr10[static_cast<long>(x0)];
out_ptr10[static_cast<long>(x0)] = tmp0;
}
}
{
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr11 + static_cast<long>(x0), 16);
tmp0.store(out_ptr11 + static_cast<long>(x0), 16);
}
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L))
{
auto tmp0 = in_ptr11[static_cast<long>(x0)];
out_ptr11[static_cast<long>(x0)] = tmp0;
}
}
{
for(long x0=static_cast<long>(0L); x0<static_cast<long>(384L); x0+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr12 + static_cast<long>(x0), 16);
auto tmp1 = at::vec::convert<int64_t,2,int32_t,1>(tmp0);
tmp1.store(out_ptr12 + static_cast<long>(x0), 16);
}
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(384L); x0<static_cast<long>(396L); x0+=static_cast<long>(1L))
{
auto tmp0 = in_ptr12[static_cast<long>(x0)];
auto tmp1 = c10::convert<int64_t>(tmp0);
out_ptr12[static_cast<long>(x0)] = tmp1;
}
}
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L))
{
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr13 + static_cast<long>(x2 + (64L*x0) + (768L*x1)), 32);
tmp0.store(out_ptr13 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32);
tmp0.store(out_ptr14 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32);
}
}
}
}
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L))
{
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr13 + static_cast<long>(49152L + x2 + (64L*x0) + (768L*x1)), 32);
tmp0.store(out_ptr15 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32);
}
}
}
}
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L))
{
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr13 + static_cast<long>(98304L + x2 + (64L*x0) + (768L*x1)), 32);
tmp0.store(out_ptr16 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32);
}
}
}
}
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L))
{
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr13 + static_cast<long>(589824L + x2 + (64L*x0) + (768L*x1)), 32);
tmp0.store(out_ptr17 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32);
tmp0.store(out_ptr18 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32);
}
}
}
}
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(192L); x1+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(1L))
{
auto tmp0 = out_ptr12[static_cast<long>((33L*(c10::div_floor_integer((x2 + (64L*x1) + (135168L*x0)), 135168L))) + (c10::div_floor_integer((x2 + (64L*x1)), 4096L)))];
auto tmp13 = out_ptr12[static_cast<long>((33L*(c10::div_floor_integer((122880L + x2 + (64L*x1) + (135168L*x0)), 135168L))) + (c10::div_floor_integer((122880L + x2 + (64L*x1)), 4096L)))];
auto tmp1 = 13L*(c10::div_floor_integer(((33L*(c10::div_floor_integer((x2 + (64L*x1) + (135168L*x0)), 135168L))) + (c10::div_floor_integer((x2 + (64L*x1)), 4096L))), 33L));
auto tmp2 = c10::convert<int64_t>(tmp1);
auto tmp3 = decltype(tmp0)(tmp0 + tmp2);
auto tmp4 = 156L;
auto tmp5 = c10::convert<int64_t>(tmp4);
auto tmp6 = decltype(tmp3)(tmp3 + tmp5);
auto tmp7 = tmp3 < 0;
auto tmp8 = tmp7 ? tmp6 : tmp3;
auto tmp9 = tmp8;
auto tmp10 = c10::convert<int64_t>(tmp9);
TORCH_CHECK((0 <= tmp10) & (tmp10 < 156L), "index out of bounds: 0 <= tmp10 < 156L");
auto tmp12 = in_ptr13[static_cast<long>(x2 + (64L*(static_cast<long>(c10::div_floor_integer(tmp8, 13L)) % static_cast<long>(12L))) + (768L*(static_cast<long>(x1) % static_cast<long>(64L))) + (49152L*(static_cast<long>(tmp8) % static_cast<long>(13L))))];
auto tmp14 = 13L*(c10::div_floor_integer(((33L*(c10::div_floor_integer((122880L + x2 + (64L*x1) + (135168L*x0)), 135168L))) + (c10::div_floor_integer((122880L + x2 + (64L*x1)), 4096L))), 33L));
auto tmp15 = c10::convert<int64_t>(tmp14);
auto tmp16 = decltype(tmp13)(tmp13 + tmp15);
auto tmp17 = decltype(tmp16)(tmp16 + tmp5);
auto tmp18 = tmp16 < 0;
auto tmp19 = tmp18 ? tmp17 : tmp16;
auto tmp20 = tmp19;
auto tmp21 = c10::convert<int64_t>(tmp20);
TORCH_CHECK((0 <= tmp21) & (tmp21 < 156L), "index out of bounds: 0 <= tmp21 < 156L");
auto tmp23 = in_ptr13[static_cast<long>(x2 + (64L*(static_cast<long>(c10::div_floor_integer(tmp19, 13L)) % static_cast<long>(12L))) + (768L*(static_cast<long>(x1) % static_cast<long>(64L))) + (49152L*(static_cast<long>(tmp19) % static_cast<long>(13L))))];
out_ptr19[static_cast<long>(x2 + (64L*x1) + (28672L*x0))] = tmp12;
out_ptr20[static_cast<long>(x2 + (64L*x1) + (28672L*x0))] = tmp23;
}
}
}
}
}
''')
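# cpp_fused__to_copy_cat_stack_1: stages indices and key/value tiles for the
# sparse attention. It copies twelve length-33 int32 index rows, widens the
# flattened 396-entry index buffer to int64 (out_ptr12), slices fixed 64x64
# blocks out of the packed bf16 layout into per-head slots (the 28672 output
# stride suggests subviews of a [12, 448, 64] concat buffer), and gathers the
# randomly selected blocks through the int64 indices under TORCH_CHECK bounds
# guards.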
cpp_fused__softmax_add_cat_minimum_mul_new_ones_rsub_2 = async_compile.cpp_pybinding(['const float*', 'const float*', 'const int64_t*', 'const bfloat16*', 'const float*', 'const float*', 'const bfloat16*', 'float*', 'float*', 'float*', 'float*', 'float*', 'float*', 'float*', 'float*', 'float*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const float* in_ptr0,
const float* in_ptr1,
const int64_t* in_ptr2,
const bfloat16* in_ptr3,
const float* in_ptr4,
const float* in_ptr5,
const bfloat16* in_ptr6,
float* out_ptr0,
float* out_ptr1,
float* out_ptr2,
float* out_ptr3,
float* out_ptr4,
float* out_ptr5,
float* out_ptr6,
float* out_ptr7,
float* out_ptr8,
bfloat16* out_ptr9,
bfloat16* out_ptr10,
bfloat16* out_ptr11,
bfloat16* out_ptr12,
bfloat16* out_ptr13,
bfloat16* out_ptr14,
bfloat16* out_ptr15,
bfloat16* out_ptr16,
bfloat16* out_ptr17)
{
{
for(long x0=static_cast<long>(0L); x0<static_cast<long>(192L); x0+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<long>(x0), 16);
tmp0.store(out_ptr0 + static_cast<long>(x0));
}
}
{
for(long x0=static_cast<long>(0L); x0<static_cast<long>(64L); x0+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<long>(768L + x0), 16);
tmp0.store(out_ptr1 + static_cast<long>(x0));
}
}
{
for(long x0=static_cast<long>(0L); x0<static_cast<long>(192L); x0+=static_cast<long>(16L))
{
auto tmp0 = static_cast<float>(1.0);
auto tmp1 = at::vec::Vectorized<float>(tmp0);
tmp1.store(out_ptr2 + static_cast<long>(x0));
}
}
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(768L); x0+=static_cast<long>(1L))
{
for(long x1=static_cast<long>(0L); x1<static_cast<long>(256L); x1+=static_cast<long>(16L))
{
auto tmp0 = static_cast<float>(1.0);
auto tmp1 = at::vec::Vectorized<float>(tmp0);
tmp1.store(out_ptr3 + static_cast<long>(x1 + (448L*x0)));
}
}
}
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L))
{
for(long x2=static_cast<long>(0L); x2<static_cast<long>(192L); x2+=static_cast<long>(16L))
{
auto tmp0 = in_ptr1[static_cast<long>(64L + x1)];
auto tmp1 =
[&]
{
__at_align__ std::array<int64_t, 16> tmpbuf;
#pragma GCC unroll 16
for (long x2_inner = 0; x2_inner < 16; x2_inner++)
{
tmpbuf[x2_inner] = in_ptr2[static_cast<long>(c10::div_floor_integer((x2 + x2_inner + (2112L*x0)), 64L))];
}
return at::vec::VectorizedN<int64_t,2>::loadu(tmpbuf.data(), 16);
}
()
;
auto tmp16 = in_ptr1[static_cast<long>(704L + x1)];
auto tmp17 =
[&]
{
__at_align__ std::array<int64_t, 16> tmpbuf;
#pragma GCC unroll 16
for (long x2_inner = 0; x2_inner < 16; x2_inner++)
{
tmpbuf[x2_inner] = in_ptr2[static_cast<long>(c10::div_floor_integer((1920L + x2 + x2_inner + (2112L*x0)), 64L))];
}
return at::vec::VectorizedN<int64_t,2>::loadu(tmpbuf.data(), 16);
}
()
;
auto tmp2 = 13L;
auto tmp3 = c10::convert<int64_t>(tmp2);
auto tmp4 = at::vec::VectorizedN<int64_t,2>(tmp3);
auto tmp5 = tmp1 + tmp4;
auto tmp6 = static_cast<int64_t>(0);
auto tmp7 = at::vec::VectorizedN<int64_t,2>(tmp6);
auto tmp8 = at::vec::VecMask<int64_t,2>(tmp1 < tmp7);
auto tmp9 = decltype(tmp5)::blendv(tmp1, tmp5, tmp8.template cast<int64_t,2>());
auto tmp10 =
[&]
{
__at_align__ std::array<int64_t, 16> tmpbuf;
tmp9.store(tmpbuf.data());
return tmpbuf;
}
()
;
auto tmp11 =
[&]
{
__at_align__ std::array<int64_t, 16> tmpbuf;
#pragma GCC unroll 16
for (long x2_inner = 0; x2_inner < 16; x2_inner++)
{
tmpbuf[x2_inner] = static_cast<long>(tmp10[x2_inner]);
}
return at::vec::VectorizedN<int64_t,2>::loadu(tmpbuf.data(), 16);
}
()
;
TORCH_CHECK((at::vec::VecMask<int64_t,2>((at::vec::VectorizedN<int64_t,2>(0) <= tmp11) & (tmp11 < at::vec::VectorizedN<int64_t,2>(13L)))).all_masked(), "index out of bounds: 0 <= tmp11 < 13L");
auto tmp13 =
[&]
{
__at_align__ std::array<float, 16> tmpbuf;
#pragma GCC unroll 16
for (long x2_inner = 0; x2_inner < 16; x2_inner++)
{
tmpbuf[x2_inner] = in_ptr1[static_cast<long>((64L*tmp10[x2_inner]) + (static_cast<long>((x2 + x2_inner)) % static_cast<long>(64L)))];
}
return at::vec::Vectorized<float>::loadu(tmpbuf.data(), 16);
}
()
;
auto tmp14 = at::vec::Vectorized<float>(tmp0);
auto tmp15 = tmp14 * tmp13;
auto tmp18 = tmp17 + tmp4;
auto tmp19 = at::vec::VecMask<int64_t,2>(tmp17 < tmp7);
auto tmp20 = decltype(tmp18)::blendv(tmp17, tmp18, tmp19.template cast<int64_t,2>());
auto tmp21 =
[&]
{
__at_align__ std::array<int64_t, 16> tmpbuf;
tmp20.store(tmpbuf.data());
return tmpbuf;
}
()
;
auto tmp22 =
[&]
{
__at_align__ std::array<int64_t, 16> tmpbuf;
#pragma GCC unroll 16
for (long x2_inner = 0; x2_inner < 16; x2_inner++)
{
tmpbuf[x2_inner] = static_cast<long>(tmp21[x2_inner]);
}
return at::vec::VectorizedN<int64_t,2>::loadu(tmpbuf.data(), 16);
}
()
;
TORCH_CHECK((at::vec::VecMask<int64_t,2>((at::vec::VectorizedN<int64_t,2>(0) <= tmp22) & (tmp22 < at::vec::VectorizedN<int64_t,2>(13L)))).all_masked(), "index out of bounds: 0 <= tmp22 < 13L");
auto tmp24 =
[&]
{
__at_align__ std::array<float, 16> tmpbuf;
#pragma GCC unroll 16
for (long x2_inner = 0; x2_inner < 16; x2_inner++)
{
tmpbuf[x2_inner] = in_ptr1[static_cast<long>((64L*tmp21[x2_inner]) + (static_cast<long>((x2 + x2_inner)) % static_cast<long>(64L)))];
}
return at::vec::Vectorized<float>::loadu(tmpbuf.data(), 16);
}
()
;
auto tmp25 = at::vec::Vectorized<float>(tmp16);
auto tmp26 = tmp25 * tmp24;
tmp15.store(out_ptr4 + static_cast<long>(x2 + (448L*x1) + (28672L*x0)));
tmp26.store(out_ptr5 + static_cast<long>(x2 + (448L*x1) + (28672L*x0)));
}
}
}
}
#pragma omp parallel num_threads(56)
{
int tid = omp_get_thread_num();
{
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(768L); x0+=static_cast<long>(1L))
{
{
float tmp_acc0 = -std::numeric_limits<float>::infinity();
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(-std::numeric_limits<float>::infinity());
for(long x1=static_cast<long>(0L); x1<static_cast<long>(448L); x1+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr3 + static_cast<long>(x1 + (448L*x0)), 16);
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr4 + static_cast<long>(x1), 16);
auto tmp7 = at::vec::Vectorized<float>::loadu(in_ptr5 + static_cast<long>(x1 + (448L*x0)), 16);
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = static_cast<float>(0.125);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 * tmp3;
auto tmp5 = (tmp4);
auto tmp8 = at::vec::minimum(tmp6, tmp7);
auto tmp9 = static_cast<float>(1.0);
auto tmp10 = at::vec::Vectorized<float>(tmp9);
auto tmp11 = tmp10 - tmp8;
auto tmp12 = static_cast<float>(-10000.0);
auto tmp13 = at::vec::Vectorized<float>(tmp12);
auto tmp14 = tmp11 * tmp13;
auto tmp15 = tmp5 + tmp14;
tmp_acc0_vec = at::vec::maximum(tmp_acc0_vec, tmp15);
}
tmp_acc0 = max_propagate_nan(tmp_acc0, at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return at::vec::maximum(x, y); }, tmp_acc0_vec));
out_ptr6[static_cast<long>(x0)] = static_cast<float>(tmp_acc0);
}
{
float tmp_acc0 = 0;
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
for(long x1=static_cast<long>(0L); x1<static_cast<long>(448L); x1+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr3 + static_cast<long>(x1 + (448L*x0)), 16);
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr4 + static_cast<long>(x1), 16);
auto tmp7 = at::vec::Vectorized<float>::loadu(in_ptr5 + static_cast<long>(x1 + (448L*x0)), 16);
auto tmp16 = out_ptr6[static_cast<long>(x0)];
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = static_cast<float>(0.125);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 * tmp3;
auto tmp5 = (tmp4);
auto tmp8 = at::vec::minimum(tmp6, tmp7);
auto tmp9 = static_cast<float>(1.0);
auto tmp10 = at::vec::Vectorized<float>(tmp9);
auto tmp11 = tmp10 - tmp8;
auto tmp12 = static_cast<float>(-10000.0);
auto tmp13 = at::vec::Vectorized<float>(tmp12);
auto tmp14 = tmp11 * tmp13;
auto tmp15 = tmp5 + tmp14;
auto tmp17 = at::vec::Vectorized<float>(tmp16);
auto tmp18 = tmp15 - tmp17;
auto tmp19 = tmp18.exp();
tmp19.store(out_ptr7 + static_cast<long>(x1 + (448L*x0)));
tmp_acc0_vec = tmp_acc0_vec + tmp19;
}
tmp_acc0 = tmp_acc0 + at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return x + y; }, tmp_acc0_vec);
out_ptr8[static_cast<long>(x0)] = static_cast<float>(tmp_acc0);
}
for(long x1=static_cast<long>(0L); x1<static_cast<long>(448L); x1+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr7 + static_cast<long>(x1 + (448L*x0)), 16);
auto tmp1 = out_ptr8[static_cast<long>(x0)];
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 / tmp2;
auto tmp4 = at::vec::convert<bfloat16>(tmp3);
tmp4.store(out_ptr9 + static_cast<long>(x1 + (448L*x0)), 16);
}
}
}
#pragma omp single
{
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L))
{
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr6 + static_cast<long>(x2 + (64L*x0) + (768L*x1)), 32);
tmp0.store(out_ptr10 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32);
tmp0.store(out_ptr11 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32);
}
}
}
}
}
#pragma omp single
{
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L))
{
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr6 + static_cast<long>(49152L + x2 + (64L*x0) + (768L*x1)), 32);
tmp0.store(out_ptr12 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32);
}
}
}
}
}
#pragma omp single
{
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L))
{
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr6 + static_cast<long>(98304L + x2 + (64L*x0) + (768L*x1)), 32);
tmp0.store(out_ptr13 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32);
}
}
}
}
}
#pragma omp single
{
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L))
{
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr6 + static_cast<long>(589824L + x2 + (64L*x0) + (768L*x1)), 32);
tmp0.store(out_ptr14 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32);
tmp0.store(out_ptr15 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32);
}
}
}
}
}
#pragma omp single
{
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(192L); x1+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(1L))
{
auto tmp0 = in_ptr2[static_cast<long>((33L*(c10::div_floor_integer((x2 + (64L*x1) + (135168L*x0)), 135168L))) + (c10::div_floor_integer((x2 + (64L*x1)), 4096L)))];
auto tmp13 = in_ptr2[static_cast<long>((33L*(c10::div_floor_integer((122880L + x2 + (64L*x1) + (135168L*x0)), 135168L))) + (c10::div_floor_integer((122880L + x2 + (64L*x1)), 4096L)))];
auto tmp1 = 13L*(c10::div_floor_integer(((33L*(c10::div_floor_integer((x2 + (64L*x1) + (135168L*x0)), 135168L))) + (c10::div_floor_integer((x2 + (64L*x1)), 4096L))), 33L));
auto tmp2 = c10::convert<int64_t>(tmp1);
auto tmp3 = decltype(tmp0)(tmp0 + tmp2);
auto tmp4 = 156L;
auto tmp5 = c10::convert<int64_t>(tmp4);
auto tmp6 = decltype(tmp3)(tmp3 + tmp5);
auto tmp7 = tmp3 < 0;
auto tmp8 = tmp7 ? tmp6 : tmp3;
auto tmp9 = tmp8;
auto tmp10 = c10::convert<int64_t>(tmp9);
TORCH_CHECK((0 <= tmp10) & (tmp10 < 156L), "index out of bounds: 0 <= tmp10 < 156L");
auto tmp12 = in_ptr6[static_cast<long>(x2 + (64L*(static_cast<long>(c10::div_floor_integer(tmp8, 13L)) % static_cast<long>(12L))) + (768L*(static_cast<long>(x1) % static_cast<long>(64L))) + (49152L*(static_cast<long>(tmp8) % static_cast<long>(13L))))];
auto tmp14 = 13L*(c10::div_floor_integer(((33L*(c10::div_floor_integer((122880L + x2 + (64L*x1) + (135168L*x0)), 135168L))) + (c10::div_floor_integer((122880L + x2 + (64L*x1)), 4096L))), 33L));
auto tmp15 = c10::convert<int64_t>(tmp14);
auto tmp16 = decltype(tmp13)(tmp13 + tmp15);
auto tmp17 = decltype(tmp16)(tmp16 + tmp5);
auto tmp18 = tmp16 < 0;
auto tmp19 = tmp18 ? tmp17 : tmp16;
auto tmp20 = tmp19;
auto tmp21 = c10::convert<int64_t>(tmp20);
TORCH_CHECK((0 <= tmp21) & (tmp21 < 156L), "index out of bounds: 0 <= tmp21 < 156L");
auto tmp23 = in_ptr6[static_cast<long>(x2 + (64L*(static_cast<long>(c10::div_floor_integer(tmp19, 13L)) % static_cast<long>(12L))) + (768L*(static_cast<long>(x1) % static_cast<long>(64L))) + (49152L*(static_cast<long>(tmp19) % static_cast<long>(13L))))];
out_ptr16[static_cast<long>(x2 + (64L*x1) + (28672L*x0))] = tmp12;
out_ptr17[static_cast<long>(x2 + (64L*x1) + (28672L*x0))] = tmp23;
}
}
}
}
}
}
}
''')
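# cpp_fused__softmax_add_cat_minimum_mul_new_ones_rsub_2: the second block-row
# attention. It assembles the band mask (copied to_mask slices, new_ones
# fills, and rand-mask products gathered through the int64 block indices),
# runs a 448-column masked softmax per row using min(from_mask, band_mask)
# with the usual (1 - m) * -10000 bias, and then stacks the key/value tiles
# (fixed slices plus bounds-checked random gathers) for the following bmm.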
cpp_fused_cat_clone_3 = async_compile.cpp_pybinding(['const bfloat16*', 'const bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const bfloat16* in_ptr0,
const bfloat16* in_ptr1,
bfloat16* out_ptr0,
bfloat16* out_ptr1,
bfloat16* out_ptr2,
bfloat16* out_ptr3)
{
#pragma omp parallel num_threads(56)
{
int tid = omp_get_thread_num();
{
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(9L); x1+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(1L))
{
for(long x3=static_cast<long>(0L); x3<static_cast<long>(64L); x3+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(49152L + x3 + (64L*x0) + (768L*x2) + (49152L*x1)), 32);
tmp0.store(out_ptr0 + static_cast<long>(x3 + (64L*x2) + (12288L*x1) + (110592L*x0)), 32);
}
}
}
}
}
{
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(9L); x1+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(1L))
{
for(long x3=static_cast<long>(0L); x3<static_cast<long>(64L); x3+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(98304L + x3 + (64L*x0) + (768L*x2) + (49152L*x1)), 32);
tmp0.store(out_ptr1 + static_cast<long>(x3 + (64L*x2) + (12288L*x1) + (110592L*x0)), 32);
}
}
}
}
}
{
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(9L); x1+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(1L))
{
for(long x3=static_cast<long>(0L); x3<static_cast<long>(64L); x3+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(147456L + x3 + (64L*x0) + (768L*x2) + (49152L*x1)), 32);
tmp0.store(out_ptr2 + static_cast<long>(x3 + (64L*x2) + (12288L*x1) + (110592L*x0)), 32);
}
}
}
}
}
{
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(576L); x1+=static_cast<long>(1L))
{
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr1 + static_cast<long>(98304L + x2 + (64L*x0) + (768L*x1)), 32);
tmp0.store(out_ptr3 + static_cast<long>(x2 + (64L*x1) + (36864L*x0)), 32);
}
}
}
}
}
}
''')
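# cpp_fused_cat_clone_3: builds the sliding-window operand for the banded
# matmul by copying three shifted views of the packed key tensor (element
# offsets 49152/98304/147456, i.e. consecutive 64-token blocks) into
# contiguous [12, 9, 64, 64] buffers, plus the middle value blocks into a
# [12, 576, 64] buffer.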
cpp_fused_clone_4 = async_compile.cpp_pybinding(['const int64_t*', 'const bfloat16*', 'const bfloat16*', 'bfloat16*', 'bfloat16*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const int64_t* in_ptr0,
const bfloat16* in_ptr1,
const bfloat16* in_ptr2,
bfloat16* out_ptr0,
bfloat16* out_ptr1)
{
#pragma omp parallel num_threads(56)
{
int tid = omp_get_thread_num();
{
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(9L); x1+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x2=static_cast<long>(0L); x2<static_cast<long>(192L); x2+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x3=static_cast<long>(0L); x3<static_cast<long>(64L); x3+=static_cast<long>(1L))
{
auto tmp0 = in_ptr0[static_cast<long>((33L*(c10::div_floor_integer((12288L + x3 + (64L*x2) + (12288L*x1) + (135168L*x0)), 135168L))) + (c10::div_floor_integer((12288L + x3 + (64L*x2) + (12288L*x1)), 4096L)))];
auto tmp1 = 13L*(c10::div_floor_integer(((33L*(c10::div_floor_integer((12288L + x3 + (64L*x2) + (12288L*x1) + (135168L*x0)), 135168L))) + (c10::div_floor_integer((12288L + x3 + (64L*x2) + (12288L*x1)), 4096L))), 33L));
auto tmp2 = c10::convert<int64_t>(tmp1);
auto tmp3 = decltype(tmp0)(tmp0 + tmp2);
auto tmp4 = 156L;
auto tmp5 = c10::convert<int64_t>(tmp4);
auto tmp6 = decltype(tmp3)(tmp3 + tmp5);
auto tmp7 = tmp3 < 0;
auto tmp8 = tmp7 ? tmp6 : tmp3;
auto tmp9 = tmp8;
auto tmp10 = c10::convert<int64_t>(tmp9);
TORCH_CHECK((0 <= tmp10) & (tmp10 < 156L), "index out of bounds: 0 <= tmp10 < 156L");
auto tmp12 = in_ptr1[static_cast<long>(x3 + (64L*(static_cast<long>(c10::div_floor_integer(tmp8, 13L)) % static_cast<long>(12L))) + (768L*(static_cast<long>(x2) % static_cast<long>(64L))) + (49152L*(static_cast<long>(tmp8) % static_cast<long>(13L))))];
auto tmp13 = in_ptr2[static_cast<long>(x3 + (64L*(static_cast<long>(c10::div_floor_integer(tmp8, 13L)) % static_cast<long>(12L))) + (768L*(static_cast<long>(x2) % static_cast<long>(64L))) + (49152L*(static_cast<long>(tmp8) % static_cast<long>(13L))))];
out_ptr0[static_cast<long>(x3 + (64L*x2) + (12288L*x1) + (110592L*x0))] = tmp12;
out_ptr1[static_cast<long>(x3 + (64L*x2) + (12288L*x1) + (110592L*x0))] = tmp13;
}
}
}
}
}
}
}
''')
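# cpp_fused_clone_4: gathers the random-attention key/value tiles. For each of
# the 12 heads and 9 middle block rows it resolves the (possibly negative)
# int64 block index, bounds-checks it against the 156 (= 12 x 13) packed
# blocks, and copies the selected 64x64 tiles from both source tensors.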
cpp_fused__softmax__to_copy_add_cat_mul_rsub_5 = async_compile.cpp_pybinding(['const bfloat16*', 'const float*', 'const bfloat16*', 'const float*', 'const bfloat16*', 'const float*', 'const int64_t*', 'const bfloat16*', 'const bfloat16*', 'const bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'float*', 'float*', 'float*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const bfloat16* in_ptr0,
const float* in_ptr1,
const bfloat16* in_ptr2,
const float* in_ptr3,
const bfloat16* in_ptr4,
const float* in_ptr5,
const int64_t* in_ptr6,
const bfloat16* in_ptr7,
const bfloat16* in_ptr8,
const bfloat16* in_ptr9,
bfloat16* out_ptr0,
bfloat16* out_ptr1,
bfloat16* out_ptr2,
bfloat16* out_ptr3,
float* out_ptr4,
float* out_ptr5,
float* out_ptr6,
bfloat16* out_ptr7,
bfloat16* out_ptr8,
bfloat16* out_ptr9,
bfloat16* out_ptr10)
{
#pragma omp parallel num_threads(56)
{
int tid = omp_get_thread_num();
{
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(6912L); x0+=static_cast<long>(1L))
{
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x1 + (64L*x0)), 16);
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(x1), 16);
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = static_cast<float>(0.125);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 * tmp3;
auto tmp5 = (tmp4);
auto tmp7 = static_cast<float>(1.0);
auto tmp8 = at::vec::Vectorized<float>(tmp7);
auto tmp9 = tmp8 - tmp6;
auto tmp10 = static_cast<float>(-10000.0);
auto tmp11 = at::vec::Vectorized<float>(tmp10);
auto tmp12 = tmp9 * tmp11;
auto tmp13 = tmp5 + tmp12;
auto tmp14 = at::vec::convert<bfloat16>(tmp13);
tmp14.store(out_ptr0 + static_cast<long>(x1 + (512L*x0)), 16);
}
}
}
{
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(576L); x1+=static_cast<long>(1L))
{
for(long x2=static_cast<long>(0L); x2<static_cast<long>(192L); x2+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr2 + static_cast<long>(x2 + (192L*x1) + (110592L*x0)), 16);
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr3 + static_cast<long>(x2 + (192L*x1)), 16);
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = static_cast<float>(0.125);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 * tmp3;
auto tmp5 = (tmp4);
auto tmp7 = static_cast<float>(1.0);
auto tmp8 = at::vec::Vectorized<float>(tmp7);
auto tmp9 = tmp8 - tmp6;
auto tmp10 = static_cast<float>(-10000.0);
auto tmp11 = at::vec::Vectorized<float>(tmp10);
auto tmp12 = tmp9 * tmp11;
auto tmp13 = tmp5 + tmp12;
auto tmp14 = at::vec::convert<bfloat16>(tmp13);
tmp14.store(out_ptr1 + static_cast<long>(x2 + (512L*x1) + (294912L*x0)), 16);
}
}
}
}
{
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(9L); x1+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(1L))
{
for(long x3=static_cast<long>(0L); x3<static_cast<long>(192L); x3+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr4 + static_cast<long>(x3 + (192L*x2) + (12288L*x1) + (110592L*x0)), 16);
auto tmp6 = in_ptr5[static_cast<long>(128L + x2 + (64L*x1))];
auto tmp7 =
[&]
{
__at_align__ std::array<int64_t, 16> tmpbuf;
#pragma GCC unroll 16
for (long x3_inner = 0; x3_inner < 16; x3_inner++)
{
tmpbuf[x3_inner] = in_ptr6[static_cast<long>(c10::div_floor_integer((192L + x3 + x3_inner + (192L*x1) + (2112L*x0)), 64L))];
}
return at::vec::VectorizedN<int64_t,2>::loadu(tmpbuf.data(), 16);
}
()
;
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = static_cast<float>(0.125);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 * tmp3;
auto tmp5 = (tmp4);
auto tmp8 = 13L;
auto tmp9 = c10::convert<int64_t>(tmp8);
auto tmp10 = at::vec::VectorizedN<int64_t,2>(tmp9);
auto tmp11 = tmp7 + tmp10;
auto tmp12 = static_cast<int64_t>(0);
auto tmp13 = at::vec::VectorizedN<int64_t,2>(tmp12);
auto tmp14 = at::vec::VecMask<int64_t,2>(tmp7 < tmp13);
auto tmp15 = decltype(tmp11)::blendv(tmp7, tmp11, tmp14.template cast<int64_t,2>());
auto tmp16 =
[&]
{
__at_align__ std::array<int64_t, 16> tmpbuf;
tmp15.store(tmpbuf.data());
return tmpbuf;
}
()
;
auto tmp17 =
[&]
{
__at_align__ std::array<int64_t, 16> tmpbuf;
#pragma GCC unroll 16
for (long x3_inner = 0; x3_inner < 16; x3_inner++)
{
tmpbuf[x3_inner] = static_cast<long>(tmp16[x3_inner]);
}
return at::vec::VectorizedN<int64_t,2>::loadu(tmpbuf.data(), 16);
}
()
;
TORCH_CHECK((at::vec::VecMask<int64_t,2>((at::vec::VectorizedN<int64_t,2>(0) <= tmp17) & (tmp17 < at::vec::VectorizedN<int64_t,2>(13L)))).all_masked(), "index out of bounds: 0 <= tmp17 < 13L");
auto tmp19 =
[&]
{
__at_align__ std::array<float, 16> tmpbuf;
#pragma GCC unroll 16
for (long x3_inner = 0; x3_inner < 16; x3_inner++)
{
tmpbuf[x3_inner] = in_ptr5[static_cast<long>((64L*tmp16[x3_inner]) + (static_cast<long>((x3 + x3_inner)) % static_cast<long>(64L)))];
}
return at::vec::Vectorized<float>::loadu(tmpbuf.data(), 16);
}
()
;
auto tmp20 = at::vec::Vectorized<float>(tmp6);
auto tmp21 = tmp20 * tmp19;
auto tmp22 = static_cast<float>(1.0);
auto tmp23 = at::vec::Vectorized<float>(tmp22);
auto tmp24 = tmp23 - tmp21;
auto tmp25 = static_cast<float>(-10000.0);
auto tmp26 = at::vec::Vectorized<float>(tmp25);
auto tmp27 = tmp24 * tmp26;
auto tmp28 = tmp5 + tmp27;
auto tmp29 = at::vec::convert<bfloat16>(tmp28);
tmp29.store(out_ptr2 + static_cast<long>(x3 + (512L*x2) + (32768L*x1) + (294912L*x0)), 16);
}
}
}
}
}
{
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(6912L); x0+=static_cast<long>(1L))
{
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr7 + static_cast<long>(x1 + (64L*x0)), 16);
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(768L + x1), 16);
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = static_cast<float>(0.125);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 * tmp3;
auto tmp5 = (tmp4);
auto tmp7 = static_cast<float>(1.0);
auto tmp8 = at::vec::Vectorized<float>(tmp7);
auto tmp9 = tmp8 - tmp6;
auto tmp10 = static_cast<float>(-10000.0);
auto tmp11 = at::vec::Vectorized<float>(tmp10);
auto tmp12 = tmp9 * tmp11;
auto tmp13 = tmp5 + tmp12;
auto tmp14 = at::vec::convert<bfloat16>(tmp13);
tmp14.store(out_ptr3 + static_cast<long>(x1 + (512L*x0)), 16);
}
}
}
{
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(6912L); x0+=static_cast<long>(1L))
{
{
float tmp_acc0 = -std::numeric_limits<float>::infinity();
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(-std::numeric_limits<float>::infinity());
for(long x1=static_cast<long>(0L); x1<static_cast<long>(512L); x1+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr8 + static_cast<long>(x1 + (512L*x0)), 16);
auto tmp1 = at::vec::convert<float>(tmp0);
tmp_acc0_vec = at::vec::maximum(tmp_acc0_vec, tmp1);
}
tmp_acc0 = max_propagate_nan(tmp_acc0, at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return at::vec::maximum(x, y); }, tmp_acc0_vec));
out_ptr4[static_cast<long>(x0)] = static_cast<float>(tmp_acc0);
}
{
float tmp_acc0 = 0;
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
for(long x1=static_cast<long>(0L); x1<static_cast<long>(512L); x1+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr8 + static_cast<long>(x1 + (512L*x0)), 16);
auto tmp2 = out_ptr4[static_cast<long>(x0)];
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 - tmp3;
auto tmp5 = tmp4.exp();
tmp5.store(out_ptr5 + static_cast<long>(x1 + (512L*x0)));
tmp_acc0_vec = tmp_acc0_vec + tmp5;
}
tmp_acc0 = tmp_acc0 + at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return x + y; }, tmp_acc0_vec);
out_ptr6[static_cast<long>(x0)] = static_cast<float>(tmp_acc0);
}
for(long x1=static_cast<long>(0L); x1<static_cast<long>(512L); x1+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr5 + static_cast<long>(x1 + (512L*x0)), 16);
auto tmp1 = out_ptr6[static_cast<long>(x0)];
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 / tmp2;
auto tmp4 = at::vec::convert<bfloat16>(tmp3);
tmp4.store(out_ptr7 + static_cast<long>(x1 + (512L*x0)), 16);
}
}
}
{
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(9L); x1+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(1L))
{
for(long x3=static_cast<long>(0L); x3<static_cast<long>(64L); x3+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr9 + static_cast<long>(49152L + x3 + (64L*x0) + (768L*x2) + (49152L*x1)), 32);
tmp0.store(out_ptr8 + static_cast<long>(x3 + (64L*x2) + (12288L*x1) + (110592L*x0)), 32);
}
}
}
}
}
{
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(9L); x1+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(1L))
{
for(long x3=static_cast<long>(0L); x3<static_cast<long>(64L); x3+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr9 + static_cast<long>(98304L + x3 + (64L*x0) + (768L*x2) + (49152L*x1)), 32);
tmp0.store(out_ptr9 + static_cast<long>(x3 + (64L*x2) + (12288L*x1) + (110592L*x0)), 32);
}
}
}
}
}
{
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(9L); x1+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(1L))
{
for(long x3=static_cast<long>(0L); x3<static_cast<long>(64L); x3+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr9 + static_cast<long>(147456L + x3 + (64L*x0) + (768L*x2) + (49152L*x1)), 32);
tmp0.store(out_ptr10 + static_cast<long>(x3 + (64L*x2) + (12288L*x1) + (110592L*x0)), 32);
}
}
}
}
}
}
}
''')
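# cpp_fused__softmax_add_cat_mul_rsub_5: the middle block rows. First-block,
# sliding-window, random-block, and last-block scores (each scaled by 0.125
# and biased with (1 - m) * -10000, the random slot using a gathered mask) are
# written into what appear to be adjacent 64/192/192/64-column slots of a
# 512-wide concat buffer; a plain softmax then normalizes all 512 columns of
# the 6912 (= 12 x 9 x 64) rows, and the shifted value tiles are staged last.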
cpp_fused_cat_6 = async_compile.cpp_pybinding(['const bfloat16*', 'bfloat16*', 'bfloat16*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const bfloat16* in_ptr0,
bfloat16* out_ptr0,
bfloat16* out_ptr1)
{
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L))
{
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(491520L + x2 + (64L*x0) + (768L*x1)), 32);
tmp0.store(out_ptr0 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32);
}
}
}
}
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L))
{
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(540672L + x2 + (64L*x0) + (768L*x1)), 32);
tmp0.store(out_ptr1 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32);
}
}
}
}
}
''')
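# cpp_fused_cat_6: slices two fixed key blocks (element offsets 491520 and
# 540672, i.e. packed blocks 10 and 11) into per-head [64, 64] tiles for the
# band of the trailing block row.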
cpp_fused__softmax_add_cat_minimum_mul_rsub_7 = async_compile.cpp_pybinding(['const float*', 'const bfloat16*', 'const float*', 'const float*', 'const bfloat16*', 'float*', 'float*', 'float*', 'float*', 'float*', 'float*', 'float*', 'bfloat16*', 'bfloat16*', 'bfloat16*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const float* in_ptr0,
const bfloat16* in_ptr1,
const float* in_ptr2,
const float* in_ptr3,
const bfloat16* in_ptr4,
float* out_ptr0,
float* out_ptr1,
float* out_ptr2,
float* out_ptr3,
float* out_ptr4,
float* out_ptr5,
float* out_ptr6,
bfloat16* out_ptr7,
bfloat16* out_ptr8,
bfloat16* out_ptr9)
{
{
for(long x0=static_cast<long>(0L); x0<static_cast<long>(64L); x0+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<long>(x0), 16);
tmp0.store(out_ptr0 + static_cast<long>(x0));
}
}
{
for(long x0=static_cast<long>(0L); x0<static_cast<long>(192L); x0+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<long>(640L + x0), 16);
tmp0.store(out_ptr1 + static_cast<long>(x0));
}
}
{
for(long x0=static_cast<long>(0L); x0<static_cast<long>(192L); x0+=static_cast<long>(16L))
{
auto tmp0 = static_cast<float>(1.0);
auto tmp1 = at::vec::Vectorized<float>(tmp0);
tmp1.store(out_ptr2 + static_cast<long>(x0));
}
}
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(768L); x0+=static_cast<long>(1L))
{
for(long x1=static_cast<long>(0L); x1<static_cast<long>(256L); x1+=static_cast<long>(16L))
{
auto tmp0 = static_cast<float>(1.0);
auto tmp1 = at::vec::Vectorized<float>(tmp0);
tmp1.store(out_ptr3 + static_cast<long>(x1 + (448L*x0)));
}
}
}
#pragma omp parallel num_threads(56)
{
int tid = omp_get_thread_num();
{
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(768L); x0+=static_cast<long>(1L))
{
{
float tmp_acc0 = -std::numeric_limits<float>::infinity();
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(-std::numeric_limits<float>::infinity());
for(long x1=static_cast<long>(0L); x1<static_cast<long>(448L); x1+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr1 + static_cast<long>(x1 + (448L*x0)), 16);
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr2 + static_cast<long>(x1), 16);
auto tmp7 = at::vec::Vectorized<float>::loadu(in_ptr3 + static_cast<long>(x1 + (448L*x0)), 16);
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = static_cast<float>(0.125);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 * tmp3;
auto tmp5 = (tmp4);
auto tmp8 = at::vec::minimum(tmp6, tmp7);
auto tmp9 = static_cast<float>(1.0);
auto tmp10 = at::vec::Vectorized<float>(tmp9);
auto tmp11 = tmp10 - tmp8;
auto tmp12 = static_cast<float>(-10000.0);
auto tmp13 = at::vec::Vectorized<float>(tmp12);
auto tmp14 = tmp11 * tmp13;
auto tmp15 = tmp5 + tmp14;
tmp_acc0_vec = at::vec::maximum(tmp_acc0_vec, tmp15);
}
tmp_acc0 = max_propagate_nan(tmp_acc0, at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return at::vec::maximum(x, y); }, tmp_acc0_vec));
out_ptr4[static_cast<long>(x0)] = static_cast<float>(tmp_acc0);
}
{
float tmp_acc0 = 0;
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
for(long x1=static_cast<long>(0L); x1<static_cast<long>(448L); x1+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr1 + static_cast<long>(x1 + (448L*x0)), 16);
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr2 + static_cast<long>(x1), 16);
auto tmp7 = at::vec::Vectorized<float>::loadu(in_ptr3 + static_cast<long>(x1 + (448L*x0)), 16);
auto tmp16 = out_ptr4[static_cast<long>(x0)];
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = static_cast<float>(0.125);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 * tmp3;
auto tmp5 = (tmp4);
auto tmp8 = at::vec::minimum(tmp6, tmp7);
auto tmp9 = static_cast<float>(1.0);
auto tmp10 = at::vec::Vectorized<float>(tmp9);
auto tmp11 = tmp10 - tmp8;
auto tmp12 = static_cast<float>(-10000.0);
auto tmp13 = at::vec::Vectorized<float>(tmp12);
auto tmp14 = tmp11 * tmp13;
auto tmp15 = tmp5 + tmp14;
auto tmp17 = at::vec::Vectorized<float>(tmp16);
auto tmp18 = tmp15 - tmp17;
auto tmp19 = tmp18.exp();
tmp19.store(out_ptr5 + static_cast<long>(x1 + (448L*x0)));
tmp_acc0_vec = tmp_acc0_vec + tmp19;
}
tmp_acc0 = tmp_acc0 + at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return x + y; }, tmp_acc0_vec);
out_ptr6[static_cast<long>(x0)] = static_cast<float>(tmp_acc0);
}
for(long x1=static_cast<long>(0L); x1<static_cast<long>(448L); x1+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr5 + static_cast<long>(x1 + (448L*x0)), 16);
auto tmp1 = out_ptr6[static_cast<long>(x0)];
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 / tmp2;
auto tmp4 = at::vec::convert<bfloat16>(tmp3);
tmp4.store(out_ptr7 + static_cast<long>(x1 + (448L*x0)), 16);
}
}
}
#pragma omp single
{
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L))
{
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr4 + static_cast<long>(491520L + x2 + (64L*x0) + (768L*x1)), 32);
tmp0.store(out_ptr8 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32);
}
}
}
}
}
#pragma omp single
{
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L))
{
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr4 + static_cast<long>(540672L + x2 + (64L*x0) + (768L*x1)), 32);
tmp0.store(out_ptr9 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32);
}
}
}
}
}
}
}
''')
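# cpp_fused__softmax_add_cat_minimum_mul_rsub_7: mirror of kernel 2 for the
# second-to-last block row: to_mask slices for the first and trailing blocks
# (columns 0..64 and 640..832), ones fills, a 448-column masked softmax with
# min(from_mask, band_mask), and copies of value blocks 10 and 11.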
cpp_fused__softmax_add_mul_rsub_8 = async_compile.cpp_pybinding(['const bfloat16*', 'const float*', 'float*', 'float*', 'float*', 'bfloat16*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const bfloat16* in_ptr0,
const float* in_ptr1,
float* out_ptr0,
float* out_ptr1,
float* out_ptr2,
bfloat16* out_ptr3)
{
#pragma omp parallel num_threads(56)
{
int tid = omp_get_thread_num();
{
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(768L); x0+=static_cast<long>(1L))
{
{
float tmp_acc0 = -std::numeric_limits<float>::infinity();
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(-std::numeric_limits<float>::infinity());
for(long x1=static_cast<long>(0L); x1<static_cast<long>(832L); x1+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x1 + (832L*x0)), 16);
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(x1), 16);
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = static_cast<float>(0.125);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 * tmp3;
auto tmp5 = (tmp4);
auto tmp7 = static_cast<float>(1.0);
auto tmp8 = at::vec::Vectorized<float>(tmp7);
auto tmp9 = tmp8 - tmp6;
auto tmp10 = static_cast<float>(-10000.0);
auto tmp11 = at::vec::Vectorized<float>(tmp10);
auto tmp12 = tmp9 * tmp11;
auto tmp13 = tmp5 + tmp12;
tmp_acc0_vec = at::vec::maximum(tmp_acc0_vec, tmp13);
}
tmp_acc0 = max_propagate_nan(tmp_acc0, at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return at::vec::maximum(x, y); }, tmp_acc0_vec));
out_ptr0[static_cast<long>(x0)] = static_cast<float>(tmp_acc0);
}
{
float tmp_acc0 = 0;
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
for(long x1=static_cast<long>(0L); x1<static_cast<long>(832L); x1+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x1 + (832L*x0)), 16);
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(x1), 16);
auto tmp14 = out_ptr0[static_cast<long>(x0)];
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = static_cast<float>(0.125);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 * tmp3;
auto tmp5 = (tmp4);
auto tmp7 = static_cast<float>(1.0);
auto tmp8 = at::vec::Vectorized<float>(tmp7);
auto tmp9 = tmp8 - tmp6;
auto tmp10 = static_cast<float>(-10000.0);
auto tmp11 = at::vec::Vectorized<float>(tmp10);
auto tmp12 = tmp9 * tmp11;
auto tmp13 = tmp5 + tmp12;
auto tmp15 = at::vec::Vectorized<float>(tmp14);
auto tmp16 = tmp13 - tmp15;
auto tmp17 = tmp16.exp();
tmp17.store(out_ptr1 + static_cast<long>(x1 + (832L*x0)));
tmp_acc0_vec = tmp_acc0_vec + tmp17;
}
tmp_acc0 = tmp_acc0 + at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return x + y; }, tmp_acc0_vec);
out_ptr2[static_cast<long>(x0)] = static_cast<float>(tmp_acc0);
}
for(long x1=static_cast<long>(0L); x1<static_cast<long>(832L); x1+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr1 + static_cast<long>(x1 + (832L*x0)), 16);
auto tmp1 = out_ptr2[static_cast<long>(x0)];
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 / tmp2;
auto tmp4 = at::vec::convert<bfloat16>(tmp3);
tmp4.store(out_ptr3 + static_cast<long>(x1 + (832L*x0)), 16);
}
}
}
}
}
''')
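# The kernel below stitches the per-band attention outputs (first block, second
# block, the banded middle blocks, second-to-last and last block) back into the
# full (1, 12, 832, 64) context layout via strided copies into one buffer, then
# scales each row by from_mask (in_ptr9). Roughly equivalent eager code, shown
# only as an illustration: torch.cat(blocks, dim=2).view(1, 12, 832, 64) * from_mask.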
cpp_fused_cat_mul_9 = async_compile.cpp_pybinding(['const bfloat16*', 'const bfloat16*', 'const bfloat16*', 'const bfloat16*', 'const bfloat16*', 'const bfloat16*', 'const bfloat16*', 'const bfloat16*', 'const bfloat16*', 'const float*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'float*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const bfloat16* in_ptr0,
const bfloat16* in_ptr1,
const bfloat16* in_ptr2,
const bfloat16* in_ptr3,
const bfloat16* in_ptr4,
const bfloat16* in_ptr5,
const bfloat16* in_ptr6,
const bfloat16* in_ptr7,
const bfloat16* in_ptr8,
const float* in_ptr9,
bfloat16* out_ptr0,
bfloat16* out_ptr1,
bfloat16* out_ptr2,
bfloat16* out_ptr3,
bfloat16* out_ptr4,
float* out_ptr5)
{
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
for(long x1=static_cast<long>(0L); x1<static_cast<long>(4096L); x1+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x1 + (4096L*x0)), 32);
tmp0.store(out_ptr0 + static_cast<long>(x1 + (53248L*x0)), 32);
}
}
}
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
for(long x1=static_cast<long>(0L); x1<static_cast<long>(4096L); x1+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr1 + static_cast<long>(x1 + (4096L*x0)), 32);
tmp0.store(out_ptr1 + static_cast<long>(x1 + (53248L*x0)), 32);
}
}
}
#pragma omp parallel num_threads(56)
{
int tid = omp_get_thread_num();
{
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
for(long x1=static_cast<long>(0L); x1<static_cast<long>(36864L); x1+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr2 + static_cast<long>(x1 + (36864L*x0)), 16);
auto tmp2 = at::vec::Vectorized<bfloat16>::loadu(in_ptr3 + static_cast<long>(x1 + (36864L*x0)), 16);
auto tmp5 = at::vec::Vectorized<bfloat16>::loadu(in_ptr4 + static_cast<long>(x1 + (36864L*x0)), 16);
auto tmp8 = at::vec::Vectorized<bfloat16>::loadu(in_ptr5 + static_cast<long>(x1 + (36864L*x0)), 16);
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp3 = at::vec::convert<float>(tmp2);
auto tmp4 = tmp1 + tmp3;
auto tmp6 = at::vec::convert<float>(tmp5);
auto tmp7 = tmp4 + tmp6;
auto tmp9 = at::vec::convert<float>(tmp8);
auto tmp10 = tmp7 + tmp9;
auto tmp11 = at::vec::convert<bfloat16>(tmp10);
tmp11.store(out_ptr2 + static_cast<long>(x1 + (53248L*x0)), 16);
}
}
}
#pragma omp single
{
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
for(long x1=static_cast<long>(0L); x1<static_cast<long>(4096L); x1+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr6 + static_cast<long>(x1 + (4096L*x0)), 32);
tmp0.store(out_ptr3 + static_cast<long>(x1 + (53248L*x0)), 32);
}
}
}
}
#pragma omp single
{
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
for(long x1=static_cast<long>(0L); x1<static_cast<long>(4096L); x1+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr7 + static_cast<long>(x1 + (4096L*x0)), 32);
tmp0.store(out_ptr4 + static_cast<long>(x1 + (53248L*x0)), 32);
}
}
}
}
{
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(832L); x1+=static_cast<long>(1L))
{
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr8 + static_cast<long>(x2 + (64L*x1) + (53248L*x0)), 16);
auto tmp2 = in_ptr9[static_cast<long>(x1)];
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 * tmp3;
tmp4.store(out_ptr5 + static_cast<long>(x2 + (64L*x1) + (53248L*x0)));
}
}
}
}
}
}
''')
async_compile.wait(globals())
del async_compile
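# call() is the generated wrapper: it takes the flattened input list, checks
# each tensor's size/stride contract with assert_size_stride (matching what was
# recorded at compile time), then interleaves extern bmm kernels with the fused
# C++ kernels above. Lines marked "# alias" are zero-copy views created by
# reinterpret_tensor at element offsets into a larger buffer (used to write
# torch.cat results in place), and the del statements drop references as early
# as possible so storage can be recycled (lines marked "# reuse").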
def call(args):
    arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, arg12_1, arg13_1, arg14_1, arg15_1, arg16_1, arg17_1, arg18_1 = args
    args.clear()
    assert_size_stride(arg0_1, (11, 3), (3, 1))
    assert_size_stride(arg1_1, (11, 3), (3, 1))
    assert_size_stride(arg2_1, (11, 3), (3, 1))
    assert_size_stride(arg3_1, (11, 3), (3, 1))
    assert_size_stride(arg4_1, (11, 3), (3, 1))
    assert_size_stride(arg5_1, (11, 3), (3, 1))
    assert_size_stride(arg6_1, (11, 3), (3, 1))
    assert_size_stride(arg7_1, (11, 3), (3, 1))
    assert_size_stride(arg8_1, (11, 3), (3, 1))
    assert_size_stride(arg9_1, (11, 3), (3, 1))
    assert_size_stride(arg10_1, (11, 3), (3, 1))
    assert_size_stride(arg11_1, (11, 3), (3, 1))
    assert_size_stride(arg12_1, (1, 12, 832, 64), (638976, 64, 768, 1))
    assert_size_stride(arg13_1, (1, 13, 64), (832, 64, 1))
    assert_size_stride(arg14_1, (1, 12, 832, 64), (638976, 64, 768, 1))
    assert_size_stride(arg15_1, (1, 12, 832, 64), (638976, 64, 768, 1))
    assert_size_stride(arg16_1, (1, 1, 1, 832), (832, 832, 832, 1))
    assert_size_stride(arg17_1, (1, 1, 9, 64, 192), (110592, 110592, 12288, 192, 1))
    assert_size_stride(arg18_1, (1, 1, 832, 1), (832, 832, 1, 1))
    buf0 = empty_strided_cpu((12, 64, 832), (53248, 832, 1), torch.bfloat16)
    # Source Nodes: [bmm], Original ATen: [aten.bmm]
    extern_kernels.bmm(reinterpret_tensor(arg12_1, (12, 64, 64), (64, 768, 1), 0), reinterpret_tensor(arg14_1, (12, 64, 832), (64, 1, 768), 0), out=buf0)
    buf1 = empty_strided_cpu((1, 12, 64, 1), (768, 64, 1, 768), torch.float32)
    buf2 = empty_strided_cpu((1, 12, 64, 832), (638976, 53248, 832, 1), torch.float32)
    buf3 = empty_strided_cpu((1, 12, 64, 1), (768, 64, 1, 768), torch.float32)
    buf4 = empty_strided_cpu((1, 12, 64, 832), (638976, 53248, 832, 1), torch.bfloat16)
    cpp_fused__softmax_add_mul_rsub_0(buf0, arg16_1, buf1, buf2, buf3, buf4)
    buf5 = empty_strided_cpu((12, 64, 64), (4096, 64, 1), torch.bfloat16)
    # Source Nodes: [bmm_1], Original ATen: [aten.bmm]
    extern_kernels.bmm(reinterpret_tensor(buf4, (12, 64, 832), (53248, 832, 1), 0), reinterpret_tensor(arg15_1, (12, 832, 64), (64, 768, 1), 0), out=buf5)
    buf18 = empty_strided_cpu((132, 3), (3, 1), torch.int32)
    buf6 = reinterpret_tensor(buf18, (11, 3), (3, 1), 0)  # alias
    buf7 = reinterpret_tensor(buf18, (11, 3), (3, 1), 33)  # alias
    buf8 = reinterpret_tensor(buf18, (11, 3), (3, 1), 66)  # alias
    buf9 = reinterpret_tensor(buf18, (11, 3), (3, 1), 99)  # alias
    buf10 = reinterpret_tensor(buf18, (11, 3), (3, 1), 132)  # alias
    buf11 = reinterpret_tensor(buf18, (11, 3), (3, 1), 165)  # alias
    buf12 = reinterpret_tensor(buf18, (11, 3), (3, 1), 198)  # alias
    buf13 = reinterpret_tensor(buf18, (11, 3), (3, 1), 231)  # alias
    buf14 = reinterpret_tensor(buf18, (11, 3), (3, 1), 264)  # alias
    buf15 = reinterpret_tensor(buf18, (11, 3), (3, 1), 297)  # alias
    buf16 = reinterpret_tensor(buf18, (11, 3), (3, 1), 330)  # alias
    buf17 = reinterpret_tensor(buf18, (11, 3), (3, 1), 363)  # alias
    buf19 = empty_strided_cpu((12, 11, 3), (33, 3, 1), torch.int64)
    buf25 = empty_strided_cpu((1, 12, 448, 64), (344064, 28672, 64, 1), torch.bfloat16)
    buf20 = reinterpret_tensor(buf25, (1, 12, 64, 64), (344064, 28672, 64, 1), 0)  # alias
    buf78 = empty_strided_cpu((1, 12, 448, 64), (344064, 28672, 64, 1), torch.bfloat16)
    buf73 = reinterpret_tensor(buf78, (1, 12, 64, 64), (344064, 28672, 64, 1), 0)  # alias
    buf21 = reinterpret_tensor(buf25, (1, 12, 64, 64), (344064, 28672, 64, 1), 4096)  # alias
    buf22 = reinterpret_tensor(buf25, (1, 12, 64, 64), (344064, 28672, 64, 1), 8192)  # alias
    buf23 = reinterpret_tensor(buf25, (1, 12, 64, 64), (344064, 28672, 64, 1), 12288)  # alias
    buf76 = reinterpret_tensor(buf78, (1, 12, 64, 64), (344064, 28672, 64, 1), 12288)  # alias
    buf24 = reinterpret_tensor(buf25, (1, 12, 192, 64), (344064, 28672, 64, 1), 16384)  # alias
    buf77 = reinterpret_tensor(buf78, (1, 12, 192, 64), (344064, 28672, 64, 1), 16384)  # alias
    cpp_fused__to_copy_cat_stack_1(arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, buf18, arg14_1, buf6, buf7, buf8, buf9, buf10, buf11, buf12, buf13, buf14, buf15, buf16, buf17, buf19, buf20, buf73, buf21, buf22, buf23, buf76, buf24, buf77)
    del arg0_1
    del arg10_1
    del arg11_1
    del arg1_1
    del arg2_1
    del arg3_1
    del arg4_1
    del arg5_1
    del arg6_1
    del arg7_1
    del arg8_1
    del arg9_1
    del buf10
    del buf11
    del buf12
    del buf13
    del buf14
    del buf15
    del buf16
    del buf17
    del buf18
    del buf20
    del buf21
    del buf22
    del buf23
    del buf24
    del buf6
    del buf7
    del buf8
    del buf9
    buf26 = empty_strided_cpu((12, 64, 448), (28672, 448, 1), torch.bfloat16)
    # Source Nodes: [bmm_2], Original ATen: [aten.bmm]
    extern_kernels.bmm(reinterpret_tensor(arg12_1, (12, 64, 64), (64, 768, 1), 49152), reinterpret_tensor(buf25, (12, 64, 448), (28672, 1, 64), 0), out=buf26)
    buf30 = empty_strided_cpu((1, 1, 1, 448), (448, 448, 448, 1), torch.float32)
    buf27 = reinterpret_tensor(buf30, (1, 1, 1, 192), (448, 448, 448, 1), 0)  # alias
    buf28 = reinterpret_tensor(buf30, (1, 1, 1, 64), (448, 448, 448, 1), 192)  # alias
    buf29 = reinterpret_tensor(buf30, (1, 1, 1, 192), (448, 448, 448, 1), 256)  # alias
    buf33 = empty_strided_cpu((1, 12, 64, 448), (344064, 28672, 448, 1), torch.float32)
    buf31 = reinterpret_tensor(buf33, (1, 12, 64, 256), (344064, 28672, 448, 1), 0)  # alias
    buf32 = reinterpret_tensor(buf33, (1, 12, 64, 192), (344064, 28672, 448, 1), 256)  # alias
    buf86 = empty_strided_cpu((1, 12, 64, 448), (344064, 28672, 448, 1), torch.float32)
    buf85 = reinterpret_tensor(buf86, (1, 12, 64, 192), (344064, 28672, 448, 1), 256)  # alias
    buf34 = buf3; del buf3  # reuse
    buf35 = empty_strided_cpu((1, 12, 64, 448), (344064, 28672, 448, 1), torch.float32)
    buf36 = buf1; del buf1  # reuse
    buf43 = reinterpret_tensor(buf25, (1, 12, 64, 448), (344064, 28672, 448, 1), 0); del buf25  # reuse
    buf42 = empty_strided_cpu((1, 12, 448, 64), (344064, 28672, 64, 1), torch.bfloat16)
    buf37 = reinterpret_tensor(buf42, (1, 12, 64, 64), (344064, 28672, 64, 1), 0)  # alias
    buf95 = empty_strided_cpu((1, 12, 448, 64), (344064, 28672, 64, 1), torch.bfloat16)
    buf90 = reinterpret_tensor(buf95, (1, 12, 64, 64), (344064, 28672, 64, 1), 0)  # alias
    buf38 = reinterpret_tensor(buf42, (1, 12, 64, 64), (344064, 28672, 64, 1), 4096)  # alias
    buf39 = reinterpret_tensor(buf42, (1, 12, 64, 64), (344064, 28672, 64, 1), 8192)  # alias
    buf40 = reinterpret_tensor(buf42, (1, 12, 64, 64), (344064, 28672, 64, 1), 12288)  # alias
    buf93 = reinterpret_tensor(buf95, (1, 12, 64, 64), (344064, 28672, 64, 1), 12288)  # alias
    buf41 = reinterpret_tensor(buf42, (1, 12, 192, 64), (344064, 28672, 64, 1), 16384)  # alias
    buf94 = reinterpret_tensor(buf95, (1, 12, 192, 64), (344064, 28672, 64, 1), 16384)  # alias
    cpp_fused__softmax_add_cat_minimum_mul_new_ones_rsub_2(arg16_1, arg13_1, buf19, buf26, buf30, buf33, arg15_1, buf27, buf28, buf29, buf31, buf32, buf85, buf34, buf35, buf36, buf43, buf37, buf90, buf38, buf39, buf40, buf93, buf41, buf94)
    del buf26
    del buf27
    del buf28
    del buf29
    del buf31
    del buf32
    del buf33
    del buf37
    del buf38
    del buf39
    del buf40
    del buf41
    buf44 = empty_strided_cpu((12, 64, 64), (4096, 64, 1), torch.bfloat16)
    # Source Nodes: [bmm_3], Original ATen: [aten.bmm]
    extern_kernels.bmm(reinterpret_tensor(buf43, (12, 64, 448), (28672, 448, 1), 0), reinterpret_tensor(buf42, (12, 448, 64), (28672, 64, 1), 0), out=buf44)
    del buf42
    buf45 = empty_strided_cpu((12, 576, 64), (36864, 64, 1), torch.bfloat16)
    # Source Nodes: [first_band_product], Original ATen: [aten.bmm]
    extern_kernels.bmm(reinterpret_tensor(arg12_1, (12, 576, 64), (64, 768, 1), 98304), reinterpret_tensor(arg14_1, (12, 64, 64), (64, 1, 768), 0), out=buf45)
    buf49 = empty_strided_cpu((1, 12, 9, 192, 64), (1327104, 110592, 12288, 64, 1), torch.bfloat16)
    buf46 = reinterpret_tensor(buf49, (1, 12, 9, 64, 64), (1327104, 110592, 12288, 64, 1), 0)  # alias
    buf47 = reinterpret_tensor(buf49, (1, 12, 9, 64, 64), (1327104, 110592, 12288, 64, 1), 4096)  # alias
    buf48 = reinterpret_tensor(buf49, (1, 12, 9, 64, 64), (1327104, 110592, 12288, 64, 1), 8192)  # alias
    buf50 = empty_strided_cpu((1, 12, 9, 64, 64), (442368, 36864, 4096, 64, 1), torch.bfloat16)
    cpp_fused_cat_clone_3(arg14_1, arg12_1, buf46, buf47, buf48, buf50)
    del buf46
    del buf47
    del buf48
    buf51 = empty_strided_cpu((108, 64, 192), (12288, 192, 1), torch.bfloat16)
    # Source Nodes: [bmm_4], Original ATen: [aten.bmm]
    extern_kernels.bmm(reinterpret_tensor(buf50, (108, 64, 64), (4096, 64, 1), 0), reinterpret_tensor(buf49, (108, 64, 192), (12288, 1, 64), 0), out=buf51)
    buf52 = buf49; del buf49  # reuse
    buf69 = empty_strided_cpu((1, 12, 9, 192, 64), (1327104, 110592, 12288, 64, 1), torch.bfloat16)
    cpp_fused_clone_4(buf19, arg14_1, arg15_1, buf52, buf69)
    buf53 = empty_strided_cpu((108, 64, 192), (12288, 192, 1), torch.bfloat16)
    # Source Nodes: [bmm_5], Original ATen: [aten.bmm]
    extern_kernels.bmm(reinterpret_tensor(buf50, (108, 64, 64), (4096, 64, 1), 0), reinterpret_tensor(buf52, (108, 64, 192), (12288, 1, 64), 0), out=buf53)
    buf54 = reinterpret_tensor(buf50, (12, 576, 64), (36864, 64, 1), 0); del buf50  # reuse
    # Source Nodes: [last_band_product], Original ATen: [aten.bmm]
    extern_kernels.bmm(reinterpret_tensor(arg12_1, (12, 576, 64), (64, 768, 1), 98304), reinterpret_tensor(arg14_1, (12, 64, 64), (64, 1, 768), 589824), out=buf54)
    buf59 = empty_strided_cpu((1, 12, 9, 64, 512), (3538944, 294912, 32768, 512, 1), torch.bfloat16)
    buf55 = reinterpret_tensor(buf59, (1, 12, 9, 64, 64), (3538944, 294912, 32768, 512, 1), 0)  # alias
    buf56 = reinterpret_tensor(buf59, (1, 12, 9, 64, 192), (3538944, 294912, 32768, 512, 1), 64)  # alias
    buf57 = reinterpret_tensor(buf59, (1, 12, 9, 64, 192), (3538944, 294912, 32768, 512, 1), 256)  # alias
    buf58 = reinterpret_tensor(buf59, (1, 12, 9, 64, 64), (3538944, 294912, 32768, 512, 1), 448)  # alias
    buf60 = empty_strided_cpu((1, 12, 9, 64, 1), (6912, 576, 64, 1, 6912), torch.float32)
    buf61 = empty_strided_cpu((1, 12, 9, 64, 512), (3538944, 294912, 32768, 512, 1), torch.float32)
    buf62 = empty_strided_cpu((1, 12, 9, 64, 1), (6912, 576, 64, 1, 6912), torch.float32)
    buf67 = empty_strided_cpu((1, 12, 9, 64, 512), (3538944, 294912, 32768, 512, 1), torch.bfloat16)
    buf66 = buf52; del buf52  # reuse
    buf63 = reinterpret_tensor(buf66, (1, 12, 9, 64, 64), (1327104, 110592, 12288, 64, 1), 0)  # alias
    buf64 = reinterpret_tensor(buf66, (1, 12, 9, 64, 64), (1327104, 110592, 12288, 64, 1), 4096)  # alias
    buf65 = reinterpret_tensor(buf66, (1, 12, 9, 64, 64), (1327104, 110592, 12288, 64, 1), 8192)  # alias
    cpp_fused__softmax__to_copy_add_cat_mul_rsub_5(buf45, arg16_1, buf51, arg17_1, buf53, arg13_1, buf19, buf54, buf59, arg15_1, buf55, buf56, buf57, buf58, buf60, buf61, buf62, buf67, buf63, buf64, buf65)
    del arg13_1
    del arg17_1
    del buf51
    del buf53
    del buf55
    del buf56
    del buf57
    del buf58
    del buf59
    del buf60
    del buf61
    del buf62
    del buf63
    del buf64
    del buf65
    buf68 = reinterpret_tensor(buf54, (108, 64, 64), (4096, 64, 1), 0); del buf54  # reuse
    # Source Nodes: [bmm_6], Original ATen: [aten.bmm]
    extern_kernels.bmm(reinterpret_tensor(buf67, (108, 64, 192), (32768, 512, 1), 64), reinterpret_tensor(buf66, (108, 192, 64), (12288, 64, 1), 0), out=buf68)
    del buf66
    buf70 = reinterpret_tensor(buf45, (108, 64, 64), (4096, 64, 1), 0); del buf45  # reuse
    # Source Nodes: [bmm_7], Original ATen: [aten.bmm]
    extern_kernels.bmm(reinterpret_tensor(buf67, (108, 64, 192), (32768, 512, 1), 256), reinterpret_tensor(buf69, (108, 192, 64), (12288, 64, 1), 0), out=buf70)
    del buf69
    buf71 = empty_strided_cpu((12, 576, 64), (36864, 64, 1), torch.bfloat16)
    # Source Nodes: [einsum_3], Original ATen: [aten.bmm]
    extern_kernels.bmm(reinterpret_tensor(buf67, (12, 576, 64), (294912, 512, 1), 0), reinterpret_tensor(arg15_1, (12, 64, 64), (64, 768, 1), 0), out=buf71)
    buf72 = empty_strided_cpu((12, 576, 64), (36864, 64, 1), torch.bfloat16)
    # Source Nodes: [einsum_4], Original ATen: [aten.bmm]
    extern_kernels.bmm(reinterpret_tensor(buf67, (12, 576, 64), (294912, 512, 1), 448), reinterpret_tensor(arg15_1, (12, 64, 64), (64, 768, 1), 589824), out=buf72)
    del buf67
    buf74 = reinterpret_tensor(buf78, (1, 12, 64, 64), (344064, 28672, 64, 1), 4096)  # alias
    buf75 = reinterpret_tensor(buf78, (1, 12, 64, 64), (344064, 28672, 64, 1), 8192)  # alias
    cpp_fused_cat_6(arg14_1, buf74, buf75)
    del buf73
    del buf74
    del buf75
    del buf76
    del buf77
    buf79 = reinterpret_tensor(buf43, (12, 64, 448), (28672, 448, 1), 0); del buf43  # reuse
    # Source Nodes: [bmm_8], Original ATen: [aten.bmm]
    extern_kernels.bmm(reinterpret_tensor(arg12_1, (12, 64, 64), (64, 768, 1), 540672), reinterpret_tensor(buf78, (12, 64, 448), (28672, 1, 64), 0), out=buf79)
    buf83 = buf30; del buf30  # reuse
    buf80 = reinterpret_tensor(buf83, (1, 1, 1, 64), (448, 448, 448, 1), 0)  # alias
    buf81 = reinterpret_tensor(buf83, (1, 1, 1, 192), (448, 448, 448, 1), 64)  # alias
    buf82 = reinterpret_tensor(buf83, (1, 1, 1, 192), (448, 448, 448, 1), 256)  # alias
    buf84 = reinterpret_tensor(buf86, (1, 12, 64, 256), (344064, 28672, 448, 1), 0)  # alias
    buf87 = buf36; del buf36  # reuse
    buf88 = buf35; del buf35  # reuse
    buf89 = buf34; del buf34  # reuse
    buf96 = reinterpret_tensor(buf78, (1, 12, 64, 448), (344064, 28672, 448, 1), 0); del buf78  # reuse
    buf91 = reinterpret_tensor(buf95, (1, 12, 64, 64), (344064, 28672, 64, 1), 4096)  # alias
    buf92 = reinterpret_tensor(buf95, (1, 12, 64, 64), (344064, 28672, 64, 1), 8192)  # alias
    cpp_fused__softmax_add_cat_minimum_mul_rsub_7(arg16_1, buf79, buf83, buf86, arg15_1, buf80, buf81, buf82, buf84, buf87, buf88, buf89, buf96, buf91, buf92)
    del buf79
    del buf80
    del buf81
    del buf82
    del buf83
    del buf84
    del buf85
    del buf86
    del buf88
    del buf90
    del buf91
    del buf92
    del buf93
    del buf94
    buf97 = empty_strided_cpu((12, 64, 64), (4096, 64, 1), torch.bfloat16)
    # Source Nodes: [bmm_9], Original ATen: [aten.bmm]
    extern_kernels.bmm(reinterpret_tensor(buf96, (12, 64, 448), (28672, 448, 1), 0), reinterpret_tensor(buf95, (12, 448, 64), (28672, 64, 1), 0), out=buf97)
    del buf95
    del buf96
    buf98 = reinterpret_tensor(buf4, (12, 64, 832), (53248, 832, 1), 0); del buf4  # reuse
    # Source Nodes: [bmm_10], Original ATen: [aten.bmm]
    extern_kernels.bmm(reinterpret_tensor(arg12_1, (12, 64, 64), (64, 768, 1), 589824), reinterpret_tensor(arg14_1, (12, 64, 832), (64, 1, 768), 0), out=buf98)
    del arg12_1
    del arg14_1
    buf99 = buf89; del buf89  # reuse
    buf100 = buf2; del buf2  # reuse
    buf101 = buf87; del buf87  # reuse
    buf102 = reinterpret_tensor(buf0, (1, 12, 64, 832), (638976, 53248, 832, 1), 0); del buf0  # reuse
    cpp_fused__softmax_add_mul_rsub_8(buf98, arg16_1, buf99, buf100, buf101, buf102)
    del arg16_1
    del buf101
    del buf98
    del buf99
    buf103 = empty_strided_cpu((12, 64, 64), (4096, 64, 1), torch.bfloat16)
    # Source Nodes: [bmm_11], Original ATen: [aten.bmm]
    extern_kernels.bmm(reinterpret_tensor(buf102, (12, 64, 832), (53248, 832, 1), 0), reinterpret_tensor(arg15_1, (12, 832, 64), (64, 768, 1), 0), out=buf103)
    del arg15_1
    buf109 = reinterpret_tensor(buf102, (1, 12, 13, 64, 64), (638976, 53248, 4096, 64, 1), 0); del buf102  # reuse
    buf104 = reinterpret_tensor(buf109, (1, 12, 1, 64, 64), (638976, 53248, 4096, 64, 1), 0)  # alias
    buf105 = reinterpret_tensor(buf109, (1, 12, 1, 64, 64), (638976, 53248, 4096, 64, 1), 4096)  # alias
    buf106 = reinterpret_tensor(buf109, (1, 12, 9, 64, 64), (638976, 53248, 4096, 64, 1), 8192)  # alias
    buf107 = reinterpret_tensor(buf109, (1, 12, 1, 64, 64), (638976, 53248, 4096, 64, 1), 45056)  # alias
    buf108 = reinterpret_tensor(buf109, (1, 12, 1, 64, 64), (638976, 53248, 4096, 64, 1), 49152)  # alias
    buf110 = reinterpret_tensor(buf100, (1, 12, 832, 64), (638976, 53248, 64, 1), 0); del buf100  # reuse
    cpp_fused_cat_mul_9(buf5, buf44, buf68, buf70, buf71, buf72, buf97, buf103, buf109, arg18_1, buf104, buf105, buf106, buf107, buf108, buf110)
    del arg18_1
    return (reinterpret_tensor(buf110, (1, 832, 12, 64), (638976, 64, 53248, 1), 0), reinterpret_tensor(buf19, (1, 12, 11, 3), (396, 33, 3, 1), 0), )
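# benchmark_compiled_module reconstructs inputs with the exact sizes, strides
# and dtypes the kernels were specialized for (rand_strided) and times call()
# via print_performance; running this generated file directly benchmarks the
# compiled module standalone.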
def benchmark_compiled_module(times=10, repeat=10):
    from torch._dynamo.testing import rand_strided
    from torch._inductor.utils import print_performance
    arg0_1 = rand_strided((11, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg1_1 = rand_strided((11, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg2_1 = rand_strided((11, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg3_1 = rand_strided((11, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg4_1 = rand_strided((11, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg5_1 = rand_strided((11, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg6_1 = rand_strided((11, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg7_1 = rand_strided((11, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg8_1 = rand_strided((11, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg9_1 = rand_strided((11, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg10_1 = rand_strided((11, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg11_1 = rand_strided((11, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg12_1 = rand_strided((1, 12, 832, 64), (638976, 64, 768, 1), device='cpu', dtype=torch.bfloat16)
    arg13_1 = rand_strided((1, 13, 64), (832, 64, 1), device='cpu', dtype=torch.float32)
    arg14_1 = rand_strided((1, 12, 832, 64), (638976, 64, 768, 1), device='cpu', dtype=torch.bfloat16)
    arg15_1 = rand_strided((1, 12, 832, 64), (638976, 64, 768, 1), device='cpu', dtype=torch.bfloat16)
    arg16_1 = rand_strided((1, 1, 1, 832), (832, 832, 832, 1), device='cpu', dtype=torch.float32)
    arg17_1 = rand_strided((1, 1, 9, 64, 192), (110592, 110592, 12288, 192, 1), device='cpu', dtype=torch.float32)
    arg18_1 = rand_strided((1, 1, 832, 1), (832, 832, 1, 1), device='cpu', dtype=torch.float32)
    fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, arg12_1, arg13_1, arg14_1, arg15_1, arg16_1, arg17_1, arg18_1])
    return print_performance(fn, times=times, repeat=repeat)
if __name__ == "__main__":
    from torch._inductor.wrapper_benchmark import compiled_module_main
    compiled_module_main('hf_BigBird', benchmark_compiled_module)
V0627 17:31:09.567000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:09.568000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0, "has_payload": "81a28a443bd0d99705f0b5d2b9a46edc"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['bsz'], accessed_by=DictGetItemGuardAccessor(bsz)
| | +- EQUALS_MATCH: L['bsz'] == 1
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202274384)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| +- GuardManager: source=L['n_heads'], accessed_by=DictGetItemGuardAccessor(n_heads)
| | +- EQUALS_MATCH: L['n_heads'] == 12
| +- GuardManager: source=L['rsqrt_d'], accessed_by=DictGetItemGuardAccessor(rsqrt_d)
| | +- EQUALS_MATCH: L['rsqrt_d'] == 0.125
| +- GuardManager: source=L['to_mask'], accessed_by=DictGetItemGuardAccessor(to_mask)
| | +- TENSOR_MATCH: check_tensor(L['to_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 1, 832], stride=[832, 832, 832, 1])
| | +- NO_HASATTR: hasattr(L['to_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| +- GuardManager: source=L['___stack0'], accessed_by=DictGetItemGuardAccessor(___stack0)
| | +- TYPE_MATCH: ___check_type_id(L['___stack0'], 7650400)
| | +- LENGTH_CHECK: len(L['___stack0']) == 12
| | +- GuardManager: source=L['___stack0'][0], accessed_by=ListGetItemGuardAccessor(0)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][0]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][0]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][0]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][1], accessed_by=ListGetItemGuardAccessor(1)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][1]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][1]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][1]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][2], accessed_by=ListGetItemGuardAccessor(2)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][2]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][2]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][2]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][3], accessed_by=ListGetItemGuardAccessor(3)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][3]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][3]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][3]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][4], accessed_by=ListGetItemGuardAccessor(4)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][4]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][4]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][4]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][5], accessed_by=ListGetItemGuardAccessor(5)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][5]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][5]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][5]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][6], accessed_by=ListGetItemGuardAccessor(6)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][6]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][6]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][6]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][7], accessed_by=ListGetItemGuardAccessor(7)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][7]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][7]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][7]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][8], accessed_by=ListGetItemGuardAccessor(8)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][8]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][8]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][8]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][9], accessed_by=ListGetItemGuardAccessor(9)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][9]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][9]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][9]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][10], accessed_by=ListGetItemGuardAccessor(10)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][10]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][10]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][10]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][11], accessed_by=ListGetItemGuardAccessor(11)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][11]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][11]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][11]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| +- GuardManager: source=L['band_mask'], accessed_by=DictGetItemGuardAccessor(band_mask)
| | +- TENSOR_MATCH: check_tensor(L['band_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 9, 64, 192], stride=[110592, 110592, 12288, 192, 1])
| | +- NO_HASATTR: hasattr(L['band_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| +- GuardManager: source=L['from_mask'], accessed_by=DictGetItemGuardAccessor(from_mask)
| | +- TENSOR_MATCH: check_tensor(L['from_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 832, 1], stride=[832, 832, 1, 1])
| | +- NO_HASATTR: hasattr(L['from_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| +- GuardManager: source=L['key_layer'], accessed_by=DictGetItemGuardAccessor(key_layer)
| | +- TENSOR_MATCH: check_tensor(L['key_layer'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.bfloat16, device=None, requires_grad=False, size=[1, 12, 832, 64], stride=[638976, 64, 768, 1])
| | +- NO_HASATTR: hasattr(L['key_layer'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| +- GuardManager: source=L['batch_size'], accessed_by=DictGetItemGuardAccessor(batch_size)
| | +- EQUALS_MATCH: L['batch_size'] == 1
| +- GuardManager: source=L['to_seq_len'], accessed_by=DictGetItemGuardAccessor(to_seq_len)
| | +- EQUALS_MATCH: L['to_seq_len'] == 832
| +- GuardManager: source=L['query_layer'], accessed_by=DictGetItemGuardAccessor(query_layer)
| | +- TENSOR_MATCH: check_tensor(L['query_layer'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.bfloat16, device=None, requires_grad=False, size=[1, 12, 832, 64], stride=[638976, 64, 768, 1])
| | +- NO_HASATTR: hasattr(L['query_layer'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| +- GuardManager: source=L['value_layer'], accessed_by=DictGetItemGuardAccessor(value_layer)
| | +- TENSOR_MATCH: check_tensor(L['value_layer'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.bfloat16, device=None, requires_grad=False, size=[1, 12, 832, 64], stride=[638976, 64, 768, 1])
| | +- NO_HASATTR: hasattr(L['value_layer'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| +- GuardManager: source=L['from_seq_len'], accessed_by=DictGetItemGuardAccessor(from_seq_len)
| | +- EQUALS_MATCH: L['from_seq_len'] == 832
| +- GuardManager: source=L['n_rand_blocks'], accessed_by=DictGetItemGuardAccessor(n_rand_blocks)
| | +- EQUALS_MATCH: L['n_rand_blocks'] == 3
| +- GuardManager: source=L['to_block_size'], accessed_by=DictGetItemGuardAccessor(to_block_size)
| | +- EQUALS_MATCH: L['to_block_size'] == 64
| +- GuardManager: source=L['from_block_size'], accessed_by=DictGetItemGuardAccessor(from_block_size)
| | +- EQUALS_MATCH: L['from_block_size'] == 64
| +- GuardManager: source=L['attn_mask_penalty'], accessed_by=DictGetItemGuardAccessor(attn_mask_penalty)
| | +- EQUALS_MATCH: L['attn_mask_penalty'] == -10000.0
| +- GuardManager: source=L['from_blocked_mask'], accessed_by=DictGetItemGuardAccessor(from_blocked_mask)
| | +- TENSOR_MATCH: check_tensor(L['from_blocked_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 13, 64], stride=[832, 64, 1])
| | +- NO_HASATTR: hasattr(L['from_blocked_mask'], '_dynamo_dynamic_indices') == False
| | +- TENSOR_ALIASING: L['from_blocked_mask'] is L['to_blocked_mask']
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| +- GuardManager: source=L['output_attentions'], accessed_by=DictGetItemGuardAccessor(output_attentions)
| | +- ID_MATCH: ___check_obj_id(L['output_attentions'], 7685824)
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['nn'], accessed_by=DictGetItemGuardAccessor(nn)
| | | +- ID_MATCH: ___check_obj_id(G['nn'], 139842442593680)
| | | +- GuardManager: source=G['nn'].functional, accessed_by=GetAttrGuardAccessor(functional)
| | | | +- ID_MATCH: ___check_obj_id(G['nn'].functional, 139842441627024)
| | | | +- GuardManager: source=G['nn'].functional.softmax, accessed_by=GetAttrGuardAccessor(softmax)
| | | | | +- ID_MATCH: ___check_obj_id(G['nn'].functional.softmax, 139842422997488)
| | +- GuardManager: source=G['np'], accessed_by=DictGetItemGuardAccessor(np)
| | | +- ID_MATCH: ___check_obj_id(G['np'], 139845228893488)
| | | +- GuardManager: source=G['np'].stack, accessed_by=GetAttrGuardAccessor(stack)
| | | | +- ID_MATCH: ___check_obj_id(G['np'].stack, 139844763318256)
| | +- GuardManager: source=G['torch'], accessed_by=DictGetItemGuardAccessor(torch)
| | | +- ID_MATCH: ___check_obj_id(G['torch'], 139845236322800)
| | | +- GuardManager: source=G['torch'].bmm, accessed_by=GetAttrGuardAccessor(bmm)
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].bmm, 139845228834192)
| | | +- GuardManager: source=G['torch'].cat, accessed_by=GetAttrGuardAccessor(cat)
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].cat, 139845228834672)
| | | +- GuardManager: source=G['torch'].div, accessed_by=GetAttrGuardAccessor(div)
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].div, 139845228790304)
| | | +- GuardManager: source=G['torch'].long, accessed_by=GetAttrGuardAccessor(long)
| | | | +- EQUALS_MATCH: G['torch'].long == torch.int64
| | | +- GuardManager: source=G['torch'].stack, accessed_by=GetAttrGuardAccessor(stack)
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].stack, 139845228799024)
| | | +- GuardManager: source=G['torch'].arange, accessed_by=GetAttrGuardAccessor(arange)
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].arange, 139845228706960)
| | | +- GuardManager: source=G['torch'].einsum, accessed_by=GetAttrGuardAccessor(einsum)
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].einsum, 139842415911568)
| | | +- GuardManager: source=G['torch'].tensor, accessed_by=GetAttrGuardAccessor(tensor)
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].tensor, 139845228703840)
| | | +- GuardManager: source=G['torch'].minimum, accessed_by=GetAttrGuardAccessor(minimum)
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].minimum, 139845228824272)
| | | +- GuardManager: source=G['torch'].transpose, accessed_by=GetAttrGuardAccessor(transpose)
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].transpose, 139845228736688)
| | +- GuardManager: source=G['__builtins_dict___46'], accessed_by=DictGetItemGuardAccessor(__builtins_dict___46)
| | | +- GuardManager: source=G['__builtins_dict___46']['len'], accessed_by=DictGetItemGuardAccessor(len)
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___46']['len'], 139845257826832)
| | | +- GuardManager: source=G['__builtins_dict___46']['zip'], accessed_by=DictGetItemGuardAccessor(zip)
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___46']['zip'], 7491872)
| | | +- GuardManager: source=G['__builtins_dict___46']['range'], accessed_by=DictGetItemGuardAccessor(range)
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___46']['range'], 7632448)
| | +- GuardManager: source=G['__import_torch_dot__dynamo_dot_polyfill'], accessed_by=DictGetItemGuardAccessor(__import_torch_dot__dynamo_dot_polyfill)
| | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot__dynamo_dot_polyfill'], 139839728158336)
| +- GuardManager: source=L['to_blocked_mask'], accessed_by=DictGetItemGuardAccessor(to_blocked_mask)
| | +- TENSOR_ALIASING: L['from_blocked_mask'] is L['to_blocked_mask']
V0627 17:31:09.568000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "15/0", "frame_key": "20", "co_name": "torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 583, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 58, "shape_env_guard_count": 0, "graph_op_count": 208, "graph_node_count": 228, "graph_input_count": 19, "start_time": 1719534664.260442, "entire_frame_compile_time_s": 5.308261871337891, "backend_compile_time_s": 5.101780414581299, "inductor_compile_time_s": 4.007972240447998, "code_gen_time_s": 3.5389716625213623, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": [], "dynamo_time_before_restart_s": 0.0, "has_guarded_code": true}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.580000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 472, "name": "forward", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 16, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.582000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 40, "size": 2555904}, "frame_id": 16, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.582000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 12, 64], "is_leaf": true, "stride": [638976, 64, 53248, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311dbd80>", "describer_id": 40}, "frame_id": 16, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.582000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 40, "id": 0, "source": "L['___stack0'][0]"}, "frame_id": 16, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.587000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {"l_stack0_0_": [1, 832, 12, 64], "contiguous": [1, 832, 12, 64], "context_layer": [1, 832, 768]}}, "frame_id": 16, "frame_compile_id": 0, "attempt": 0, "has_payload": "153b3dc8bb7ea7326b02a24531cf2b23"}
class GraphModule(torch.nn.Module):
    def forward(self, L_stack0_0_: "f32[1, 832, 12, 64][638976, 64, 53248, 1]cpu"):
        l_stack0_0_ = L_stack0_0_
        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:495 in torch_dynamo_resume_in_forward_at_472, code: context_layer = context_layer.contiguous().view(batch_size, from_seq_length, -1)
        contiguous: "f32[1, 832, 12, 64][638976, 768, 64, 1]cpu" = l_stack0_0_.contiguous(); l_stack0_0_ = None
        context_layer: "f32[1, 832, 768][638976, 768, 1]cpu" = contiguous.view(1, 832, -1); contiguous = None
        return (context_layer,)
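# The input arrives with strides (638976, 64, 53248, 1), i.e. a permuted
# (head-major) memory layout, so .view() cannot reinterpret it directly;
# .contiguous() materializes a row-major copy first, which is why the graphs
# below contain a clone followed by a view/reshape.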
V0627 17:31:09.599000 139845268738432 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:199] {"aot_forward_graph": {}, "frame_id": 16, "frame_compile_id": 0, "attempt": 0, "has_payload": "4088b7608c41845b848a0fa539961d1e"}
class <lambda>(torch.nn.Module):
    def forward(self, arg0_1: "f32[1, 832, 12, 64][638976, 64, 53248, 1]cpu"):
        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:495 in torch_dynamo_resume_in_forward_at_472, code: context_layer = context_layer.contiguous().view(batch_size, from_seq_length, -1)
        clone: "f32[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.clone.default(arg0_1, memory_format = torch.contiguous_format); arg0_1 = None
        view: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.view.default(clone, [1, 832, -1]); clone = None
        return (view,)
V0627 17:31:09.609000 139845268738432 torch/_inductor/compile_fx.py:754] {"inductor_post_grad_graph": {}, "frame_id": 16, "frame_compile_id": 0, "attempt": 0, "has_payload": "33da1fe849e643eaf3458df62aaeea7e"}
class <lambda>(torch.nn.Module):
    def forward(self, arg0_1: "f32[1, 832, 12, 64][638976, 64, 53248, 1]cpu"):
        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:495 in torch_dynamo_resume_in_forward_at_472, code: context_layer = context_layer.contiguous().view(batch_size, from_seq_length, -1)
        clone: "f32[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.clone.default(arg0_1, memory_format = torch.contiguous_format); arg0_1 = None
        view: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.reshape.default(clone, [1, 832, -1]); clone = None
        return (view,)
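# Note the only post-grad change: aten.view is normalized to aten.reshape. The
# clone then lowers to the single transpose-copy kernel in the generated module
# below, while the reshape becomes a zero-copy reinterpret_tensor in call().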
V0627 17:31:09.703000 139845268738432 torch/_inductor/graph.py:1693] {"inductor_output_code": {"filename": "/tmp/torchinductor_leslie/7l/c7lltvlss5l4w5dsp4k3kpmjg6nemqpgb5mrjqqw2csgjbuvtav3.py"}, "frame_id": 16, "frame_compile_id": 0, "attempt": 0, "has_payload": "675b3bf5875d915c125bff4b02eb31f4"}
# AOT ID: ['9_inference']
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
from torch._inductor.codegen.memory_planning import _align as align
from torch import device, empty_strided
from torch._inductor.async_compile import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
from torch._inductor.codegen.multi_kernel import MultiKernelCall

aten = torch.ops.aten
inductor_ops = torch.ops.inductor
_quantized = torch.ops._quantized
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
alloc_from_pool = torch.ops.inductor._alloc_from_pool
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
async_compile = AsyncCompile()

cpp_fused_clone_0 = async_compile.cpp_pybinding(['const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const float* in_ptr0,
                       float* out_ptr0)
{
    #pragma omp parallel num_threads(56)
    {
        int tid = omp_get_thread_num();
        {
            #pragma omp for
            for(long x0=static_cast<long>(0L); x0<static_cast<long>(832L); x0+=static_cast<long>(1L))
            {
                #pragma GCC ivdep
                for(long x1=static_cast<long>(0L); x1<static_cast<long>(12L); x1+=static_cast<long>(1L))
                {
                    for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(16L))
                    {
                        auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<long>(x2 + (64L*x0) + (53248L*x1)), 16);
                        tmp0.store(out_ptr0 + static_cast<long>(x2 + (64L*x1) + (768L*x0)));
                    }
                }
            }
        }
    }
}
''')

async_compile.wait(globals())
del async_compile

def call(args):
    arg0_1, = args
    args.clear()
    assert_size_stride(arg0_1, (1, 832, 12, 64), (638976, 64, 53248, 1))
    buf0 = empty_strided_cpu((1, 832, 12, 64), (638976, 768, 64, 1), torch.float32)
    cpp_fused_clone_0(arg0_1, buf0)
    del arg0_1
    return (reinterpret_tensor(buf0, (1, 832, 768), (638976, 768, 1), 0), )

def benchmark_compiled_module(times=10, repeat=10):
    from torch._dynamo.testing import rand_strided
    from torch._inductor.utils import print_performance
    arg0_1 = rand_strided((1, 832, 12, 64), (638976, 64, 53248, 1), device='cpu', dtype=torch.float32)
    fn = lambda: call([arg0_1])
    return print_performance(fn, times=times, repeat=repeat)

if __name__ == "__main__":
    from torch._inductor.wrapper_benchmark import compiled_module_main
    compiled_module_main('hf_BigBird', benchmark_compiled_module)
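The generated wrapper is directly exercisable: the single kernel cpp_fused_clone_0 performs the permuted copy, and the final view is free (reinterpret_tensor only rewrites metadata). A hedged usage sketch, assuming this file has been imported so that call() is in scope:

    import torch
    from torch._dynamo.testing import rand_strided

    inp = rand_strided((1, 832, 12, 64), (638976, 64, 53248, 1), device='cpu', dtype=torch.float32)
    expected = inp.contiguous().view(1, 832, -1)
    (out,) = call([inp])  # call() empties the args list, but our local reference keeps inp alive
    assert torch.equal(out, expected)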
V0627 17:31:09.710000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 16, "frame_compile_id": 0, "attempt": 0, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:09.710000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 16, "frame_compile_id": 0, "attempt": 0, "has_payload": "832b6bdf2f2092cb0e2ca7f3e3a30237"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['___stack0'], accessed_by=DictGetItemGuardAccessor(___stack0)
| | +- TYPE_MATCH: ___check_type_id(L['___stack0'], 7625984)
| | +- LENGTH_CHECK: len(L['___stack0']) == 2
| | +- GuardManager: source=L['___stack0'][0], accessed_by=TupleGetItemGuardAccessor(0)
| | | +- TENSOR_MATCH: check_tensor(L['___stack0'][0], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 832, 12, 64], stride=[638976, 64, 53248, 1])
| | | +- NO_HASATTR: hasattr(L['___stack0'][0], '_dynamo_dynamic_indices') == False
| | +- GuardManager: source=L['___stack0'][1], accessed_by=TupleGetItemGuardAccessor(1)
| | | +- ID_MATCH: ___check_obj_id(L['___stack0'][1], 7636800)
| +- GuardManager: source=L['batch_size'], accessed_by=DictGetItemGuardAccessor(batch_size)
| | +- EQUALS_MATCH: L['batch_size'] == 1
| +- GuardManager: source=L['from_seq_length'], accessed_by=DictGetItemGuardAccessor(from_seq_length)
| | +- EQUALS_MATCH: L['from_seq_length'] == 832
| +- GuardManager: source=L['output_attentions'], accessed_by=DictGetItemGuardAccessor(output_attentions)
| | +- ID_MATCH: ___check_obj_id(L['output_attentions'], 7685824)
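The guard tree records what this frame specialized on: dtype/shape/stride of the tensor via TENSOR_MATCH, and the Python ints batch_size == 1 and from_seq_length == 832 via EQUALS_MATCH, so any other sequence length fails the guards and recompiles the frame. If dynamic sequence lengths were wanted, one illustrative escape hatch (an assumption about usage, not something this run does) is to mark the sequence dim dynamic before tracing:

    import torch

    torch._dynamo.mark_dynamic(input_ids, 1)  # `input_ids` is a hypothetical handle to the model input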
V0627 17:31:09.710000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "16/0", "frame_key": "21", "co_name": "torch_dynamo_resume_in_forward_at_472", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 472, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 11, "shape_env_guard_count": 0, "graph_op_count": 2, "graph_node_count": 4, "graph_input_count": 1, "start_time": 1719534669.5804062, "entire_frame_compile_time_s": 0.13004136085510254, "backend_compile_time_s": 0.12020564079284668, "inductor_compile_time_s": 0.09919452667236328, "code_gen_time_s": 0.08350419998168945, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": [], "dynamo_time_before_restart_s": 0.0, "has_guarded_code": true}, "frame_id": 16, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.711000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.719000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 42, "size": 2555904}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.719000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30911a30>", "describer_id": 42}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.719000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 42, "id": 0, "source": "L['___stack0'][0]"}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.724000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 3, "describer_id": 42, "size": 2555904}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.724000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 4, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 3, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6e30>", "describer_id": 42}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.724000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 42, "id": 4, "source": "L['hidden_states']"}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.731000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {"l_stack0_0_": [1, 832, 768], "l_hidden_states_": [1, 832, 768], "hidden_states": [1, 832, 768], "hidden_states_1": [1, 832, 768], "add": [1, 832, 768], "hidden_states_2": [1, 832, 768]}}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0, "has_payload": "5cbaeaa3b94e9560f38738cbbbf2efd6"}
class GraphModule(torch.nn.Module):
    def forward(self, L_stack0_0_: "f32[1, 832, 768][638976, 768, 1]cpu", L_hidden_states_: "f32[1, 832, 768][638976, 768, 1]cpu"):
        l_stack0_0_ = L_stack0_0_
        l_hidden_states_ = L_hidden_states_

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1316 in forward, code: hidden_states = self.dense(hidden_states)
        hidden_states: "bf16[1, 832, 768][638976, 768, 1]cpu" = self.L__self___output_dense(l_stack0_0_); l_stack0_0_ = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1317 in forward, code: hidden_states = self.dropout(hidden_states)
        hidden_states_1: "bf16[1, 832, 768][638976, 768, 1]cpu" = self.L__self___output_dropout(hidden_states); hidden_states = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1318 in forward, code: hidden_states = self.LayerNorm(hidden_states + input_tensor)
        add: "f32[1, 832, 768][638976, 768, 1]cpu" = hidden_states_1 + l_hidden_states_; hidden_states_1 = l_hidden_states_ = None
        hidden_states_2: "f32[1, 832, 768][638976, 768, 1]cpu" = self.L__self___output_LayerNorm(add); add = None
        return (hidden_states_2,)
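This frame (the attention output projection plus residual LayerNorm) shows the bf16 autocast pattern: dense and dropout produce bf16 (note the AutocastCPU key in this frame's TENSOR_MATCH guards), while the residual add promotes back to fp32 for the LayerNorm. An eager sketch of the same dtype flow, with dense/ln as stand-ins for the real modules:

    import torch

    dense = torch.nn.Linear(768, 768)
    ln = torch.nn.LayerNorm(768, eps=1e-12)
    x = torch.randn(1, 832, 768)
    residual = torch.randn(1, 832, 768)
    with torch.autocast("cpu", dtype=torch.bfloat16):
        h = dense(x)            # bf16 under autocast
        out = ln(h + residual)  # bf16 + f32 promotes to f32; LayerNorm runs in f32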
V0627 17:31:09.771000 139845268738432 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:199] {"aot_forward_graph": {}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0, "has_payload": "1cd1232b8ea80a91453ce72d7309f42c"}
class <lambda>(torch.nn.Module):
    def forward(self, arg0_1: "f32[768, 768][768, 1]cpu", arg1_1: "f32[768][1]cpu", arg2_1: "f32[768][1]cpu", arg3_1: "f32[768][1]cpu", arg4_1: "f32[1, 832, 768][638976, 768, 1]cpu", arg5_1: "f32[1, 832, 768][638976, 768, 1]cpu"):
        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1316 in forward, code: hidden_states = self.dense(hidden_states)
        convert_element_type: "bf16[768][1]cpu" = torch.ops.prims.convert_element_type.default(arg1_1, torch.bfloat16); arg1_1 = None
        convert_element_type_1: "bf16[768, 768][768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg0_1, torch.bfloat16); arg0_1 = None
        convert_element_type_2: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg4_1, torch.bfloat16); arg4_1 = None
        view: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.view.default(convert_element_type_2, [832, 768]); convert_element_type_2 = None
        permute: "bf16[768, 768][1, 768]cpu" = torch.ops.aten.permute.default(convert_element_type_1, [1, 0]); convert_element_type_1 = None
        addmm: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.addmm.default(convert_element_type, view, permute); convert_element_type = view = permute = None
        view_1: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.view.default(addmm, [1, 832, 768]); addmm = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1317 in forward, code: hidden_states = self.dropout(hidden_states)
        clone: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.clone.default(view_1); view_1 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1318 in forward, code: hidden_states = self.LayerNorm(hidden_states + input_tensor)
        add: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(clone, arg5_1); clone = arg5_1 = None
        var_mean = torch.ops.aten.var_mean.correction(add, [2], correction = 0, keepdim = True)
        getitem: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[0]
        getitem_1: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[1]; var_mean = None
        add_1: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.add.Tensor(getitem, 1e-12); getitem = None
        rsqrt: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.rsqrt.default(add_1); add_1 = None
        sub: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.sub.Tensor(add, getitem_1); add = getitem_1 = None
        mul: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(sub, rsqrt); sub = rsqrt = None
        mul_1: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(mul, arg2_1); mul = arg2_1 = None
        add_2: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(mul_1, arg3_1); mul_1 = arg3_1 = None
        return (add_2,)
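The AOT graph applies the standard decompositions: nn.Linear becomes addmm(bias, input_2d, weight.T), eval-mode Dropout is a bare clone, and LayerNorm is inlined as var_mean (correction=0, i.e. biased variance) -> rsqrt -> scale/shift with eps 1e-12. A reference sketch of that tail:

    import torch

    def layer_norm_ref(x, weight, bias, eps=1e-12):
        # matches the var_mean / rsqrt / mul / add chain in the graph above
        var, mean = torch.var_mean(x, dim=-1, correction=0, keepdim=True)
        return (x - mean) * torch.rsqrt(var + eps) * weight + bias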
V0627 17:31:09.822000 139845268738432 torch/_inductor/compile_fx.py:754] {"inductor_post_grad_graph": {}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0, "has_payload": "e2b95d7d56d3ed2a8ad6cfb284f41613"}
class <lambda>(torch.nn.Module):
    def forward(self, arg4_1: "f32[1, 832, 768][638976, 768, 1]cpu", arg5_1: "f32[1, 832, 768][638976, 768, 1]cpu"):
        # No stacktrace found for following nodes
        _frozen_param2: "f32[768][1]cpu" = self._frozen_param2
        _frozen_param3: "f32[768][1]cpu" = self._frozen_param3

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1316 in forward, code: hidden_states = self.dense(hidden_states)
        _frozen_param4: "bf16[768][1]cpu" = self._frozen_param4

        # No stacktrace found for following nodes
        _frozen_param6: "bf16[768, 768][1, 0]cpu" = self._frozen_param6

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1316 in forward, code: hidden_states = self.dense(hidden_states)
        convert_element_type_2: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg4_1, torch.bfloat16); arg4_1 = None
        _linear_pointwise_default_1: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.mkldnn._linear_pointwise.default(convert_element_type_2, _frozen_param6, _frozen_param4, 'none', [], ''); convert_element_type_2 = _frozen_param6 = _frozen_param4 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1318 in forward, code: hidden_states = self.LayerNorm(hidden_states + input_tensor)
        add: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(_linear_pointwise_default_1, arg5_1); _linear_pointwise_default_1 = arg5_1 = None
        var_mean = torch.ops.aten.var_mean.correction(add, [2], correction = 0, keepdim = True)
        getitem: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[0]
        getitem_1: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[1]; var_mean = None
        sub: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.sub.Tensor(add, getitem_1); add = getitem_1 = None
        add_1: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.add.Tensor(getitem, 1e-12); getitem = None
        rsqrt: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.rsqrt.default(add_1); add_1 = None
        mul: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(sub, rsqrt); sub = rsqrt = None
        mul_1: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(mul, _frozen_param2); mul = _frozen_param2 = None
        add_2: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(mul_1, _frozen_param3); mul_1 = _frozen_param3 = None
        return (add_2,)
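After the post-grad/freezing passes the parameters disappear from the signature: weights and biases are constant-folded into _frozen_param* attributes, cast to bf16 ahead of time, and the weight is prepacked for oneDNN. The nominal stride (1, 0) on _frozen_param6 is the tell-tale of an opaque packed layout, not a real dense stride, and the addmm is replaced by torch.ops.mkldnn._linear_pointwise with no fused epilogue ('none'). This shape of graph is roughly what the following configuration produces (an assumption about the run, not recorded in this log):

    import torch
    import torch._inductor.config as inductor_config

    inductor_config.freezing = True   # fold parameters into constants and prepack for oneDNN
    compiled = torch.compile(model)   # `model` is a hypothetical handle; exact flags are not in the log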
V0627 17:31:09.919000 139845268738432 torch/_inductor/graph.py:1693] {"inductor_output_code": {"filename": "/tmp/torchinductor_leslie/ot/cotc6xdws22smodcitafp7uurqklfk4ux2ijtnzkqwktzn6c3wk3.py"}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0, "has_payload": "320320d26970537cad9fa4b92420ab78"}
# AOT ID: ['10_inference']
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
from torch._inductor.codegen.memory_planning import _align as align
from torch import device, empty_strided
from torch._inductor.async_compile import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
from torch._inductor.codegen.multi_kernel import MultiKernelCall

aten = torch.ops.aten
inductor_ops = torch.ops.inductor
_quantized = torch.ops._quantized
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
alloc_from_pool = torch.ops.inductor._alloc_from_pool
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
async_compile = AsyncCompile()

_frozen_param2 = None  # device(type='cpu') torch.float32 (768,) (1,) 7f2eb1d44fe0
_frozen_param3 = None  # device(type='cpu') torch.float32 (768,) (1,) 7f2eb1d45080
_frozen_param4 = None  # device(type='cpu') torch.bfloat16 (768,) (1,) 7f2e30928a90
_frozen_param6 = None  # device(type='cpu') torch.bfloat16 (768, 768) (1, 0) 7f2e303a2cf0

cpp_fused__to_copy_0 = async_compile.cpp_pybinding(['const float*', 'bfloat16*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const float* in_ptr0,
                       bfloat16* out_ptr0)
{
    #pragma omp parallel num_threads(56)
    {
        int tid = omp_get_thread_num();
        {
            #pragma omp for
            for(long x0=static_cast<long>(0L); x0<static_cast<long>(638976L); x0+=static_cast<long>(16L))
            {
                auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<long>(x0), 16);
                auto tmp1 = at::vec::convert<bfloat16>(tmp0);
                tmp1.store(out_ptr0 + static_cast<long>(x0), 16);
            }
        }
    }
}
''')

cpp_fused_add_native_layer_norm_1 = async_compile.cpp_pybinding(['const bfloat16*', 'const float*', 'const float*', 'const float*', 'float*', 'float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const bfloat16* in_ptr0,
                       const float* in_ptr1,
                       const float* in_ptr2,
                       const float* in_ptr3,
                       float* out_ptr0,
                       float* out_ptr1,
                       float* out_ptr2)
{
    #pragma omp parallel num_threads(56)
    {
        int tid = omp_get_thread_num();
        {
            #pragma omp for
            for(long x0=static_cast<long>(0L); x0<static_cast<long>(832L); x0+=static_cast<long>(1L))
            {
                {
                    Welford<float> tmp_acc0 = Welford<float>();
                    Welford<at::vec::Vectorized<float>> tmp_acc0_vec = Welford<at::vec::Vectorized<float>>();
                    static WeightRecp<at::vec::Vectorized<float>> weight_recps(static_cast<long>(48L));
                    for(long x1=static_cast<long>(0L); x1<static_cast<long>(768L); x1+=static_cast<long>(16L))
                    {
                        auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x1 + (768L*x0)), 16);
                        auto tmp2 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(x1 + (768L*x0)), 16);
                        auto tmp1 = at::vec::convert<float>(tmp0);
                        auto tmp3 = tmp1 + tmp2;
                        tmp_acc0_vec = welford_combine(tmp_acc0_vec, tmp3, &weight_recps);
                    }
                    tmp_acc0 = welford_combine(tmp_acc0, welford_vec_reduce_all(tmp_acc0_vec));
                    out_ptr0[static_cast<long>(x0)] = static_cast<float>(tmp_acc0.mean);
                    out_ptr1[static_cast<long>(x0)] = static_cast<float>(tmp_acc0.m2);
                }
                for(long x1=static_cast<long>(0L); x1<static_cast<long>(768L); x1+=static_cast<long>(16L))
                {
                    auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x1 + (768L*x0)), 16);
                    auto tmp2 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(x1 + (768L*x0)), 16);
                    auto tmp4 = out_ptr0[static_cast<long>(x0)];
                    auto tmp7 = out_ptr1[static_cast<long>(x0)];
                    auto tmp15 = at::vec::Vectorized<float>::loadu(in_ptr2 + static_cast<long>(x1), 16);
                    auto tmp17 = at::vec::Vectorized<float>::loadu(in_ptr3 + static_cast<long>(x1), 16);
                    auto tmp1 = at::vec::convert<float>(tmp0);
                    auto tmp3 = tmp1 + tmp2;
                    auto tmp5 = at::vec::Vectorized<float>(tmp4);
                    auto tmp6 = tmp3 - tmp5;
                    auto tmp8 = static_cast<float>(768.0);
                    auto tmp9 = tmp7 / tmp8;
                    auto tmp10 = static_cast<float>(1e-12);
                    auto tmp11 = decltype(tmp9)(tmp9 + tmp10);
                    auto tmp12 = 1 / std::sqrt(tmp11);
                    auto tmp13 = at::vec::Vectorized<float>(tmp12);
                    auto tmp14 = tmp6 * tmp13;
                    auto tmp16 = tmp14 * tmp15;
                    auto tmp18 = tmp16 + tmp17;
                    tmp18.store(out_ptr2 + static_cast<long>(x1 + (768L*x0)));
                }
            }
        }
    }
}
''')
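# The kernel above computes the LayerNorm statistics in a single pass with
# Welford's algorithm: welford_combine maintains (mean, m2, weight) per lane
# using delta = x - mean; mean += delta/n; m2 += delta*(x - mean), with
# WeightRecp caching the 48 (= 768/16) reciprocal weights, and
# welford_vec_reduce_all folds the 16-lane vector accumulator into a scalar.
# The second x1 loop then normalizes with rsqrt(m2/768 + 1e-12) and applies
# the affine weight (in_ptr2) and bias (in_ptr3).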
async_compile.wait(globals())
del async_compile

def call(args):
    arg4_1, arg5_1 = args
    args.clear()
    assert_size_stride(arg4_1, (1, 832, 768), (638976, 768, 1))
    assert_size_stride(arg5_1, (1, 832, 768), (638976, 768, 1))
    buf0 = empty_strided_cpu((1, 832, 768), (638976, 768, 1), torch.bfloat16)
    cpp_fused__to_copy_0(arg4_1, buf0)
    del arg4_1
    buf1 = torch.ops.mkldnn._linear_pointwise(buf0, _frozen_param6, _frozen_param4, 'none', [-1], '')
    del buf0
    buf2 = empty_strided_cpu((1, 832, 1), (832, 1, 832), torch.float32)
    buf3 = empty_strided_cpu((1, 832, 1), (832, 1, 832), torch.float32)
    buf5 = empty_strided_cpu((1, 832, 768), (638976, 768, 1), torch.float32)
    cpp_fused_add_native_layer_norm_1(buf1, arg5_1, _frozen_param2, _frozen_param3, buf2, buf3, buf5)
    del arg5_1
    return (buf5, )

def benchmark_compiled_module(times=10, repeat=10):
    from torch._dynamo.testing import rand_strided
    from torch._inductor.utils import print_performance
    global _frozen_param2
    _frozen_param2 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.float32)
    global _frozen_param3
    _frozen_param3 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.float32)
    global _frozen_param4
    _frozen_param4 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.bfloat16)
    global _frozen_param6
    _frozen_param6 = rand_strided((768, 768), (1, 0), device='cpu', dtype=torch.bfloat16)
    arg4_1 = rand_strided((1, 832, 768), (638976, 768, 1), device='cpu', dtype=torch.float32)
    arg5_1 = rand_strided((1, 832, 768), (638976, 768, 1), device='cpu', dtype=torch.float32)
    fn = lambda: call([arg4_1, arg5_1])
    return print_performance(fn, times=times, repeat=repeat)

if __name__ == "__main__":
    from torch._inductor.wrapper_benchmark import compiled_module_main
    compiled_module_main('hf_BigBird', benchmark_compiled_module)
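The trailing harness makes each generated file a standalone script: running it calls compiled_module_main('hf_BigBird', benchmark_compiled_module), which populates the _frozen_param* globals with rand_strided stand-ins and times call() via print_performance, e.g. (illustrative invocation):

    python /tmp/torchinductor_leslie/ot/cotc6xdws22smodcitafp7uurqklfk4ux2ijtnzkqwktzn6c3wk3.py

One caveat (an observation, not something the log verifies): at real run time _frozen_param6 is an opaque oneDNN-packed weight, so the dense rand_strided((768, 768), (1, 0)) stand-in reproduces only the metadata, not the packed layout.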
V0627 17:31:09.931000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:09.932000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0, "has_payload": "cc9600447bc28ad3ba928d7719c0654d"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202275632)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| | | +- GuardManager: source=L['self']._modules, accessed_by=DictGetItemGuardAccessor(_modules)
| | | | +- GuardManager: source=L['self'].output, accessed_by=DictGetItemGuardAccessor(output)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].output, 139839202272272)
| | | | | +- GuardManager: source=L['self'].output.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- DICT_CONTAINS: not ___dict_contains('forward', L['self'].output.__dict__)
| | | | | | +- GuardManager: source=L['self'].output.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.training, 7685824)
| | | | | | +- GuardManager: source=L['self'].output._modules, accessed_by=DictGetItemGuardAccessor(_modules)
| | | | | | | +- GuardManager: source=L['self'].output.dense, accessed_by=DictGetItemGuardAccessor(dense)
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.dense, 139839202271456)
| | | | | | | | +- GuardManager: source=L['self'].output.dense.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | | | | +- GuardManager: source=L['self'].output.dense.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.dense.training, 7685824)
| | | | | | | +- GuardManager: source=L['self'].output.dropout, accessed_by=DictGetItemGuardAccessor(dropout)
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.dropout, 139839202271168)
| | | | | | | | +- GuardManager: source=L['self'].output.dropout.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | | | | +- GuardManager: source=L['self'].output.dropout.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.dropout.training, 7685824)
| | | | | | | +- GuardManager: source=L['self'].output.LayerNorm, accessed_by=DictGetItemGuardAccessor(LayerNorm)
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.LayerNorm, 139839202271504)
| | | | | | | | +- GuardManager: source=L['self'].output.LayerNorm.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | | | | +- GuardManager: source=L['self'].output.LayerNorm.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.LayerNorm.training, 7685824)
| | | | | | +- GuardManager: source=L['self'].output._forward_hooks, accessed_by=DictGetItemGuardAccessor(_forward_hooks)
| | | | | | +- GuardManager: source=L['self'].output._backward_hooks, accessed_by=DictGetItemGuardAccessor(_backward_hooks)
| | | | | | +- GuardManager: source=L['self'].output._forward_pre_hooks, accessed_by=DictGetItemGuardAccessor(_forward_pre_hooks)
| | | | | | +- GuardManager: source=L['self'].output._backward_pre_hooks, accessed_by=DictGetItemGuardAccessor(_backward_pre_hooks)
| +- GuardManager: source=L['___stack0'], accessed_by=DictGetItemGuardAccessor(___stack0)
| | +- TYPE_MATCH: ___check_type_id(L['___stack0'], 7625984)
| | +- LENGTH_CHECK: len(L['___stack0']) == 1
| | +- GuardManager: source=L['___stack0'][0], accessed_by=TupleGetItemGuardAccessor(0)
| | | +- TENSOR_MATCH: check_tensor(L['___stack0'][0], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 832, 768], stride=[638976, 768, 1])
| | | +- NO_HASATTR: hasattr(L['___stack0'][0], '_dynamo_dynamic_indices') == False
| | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['___stack0'][0], L['hidden_states'])
| +- GuardManager: source=L['hidden_states'], accessed_by=DictGetItemGuardAccessor(hidden_states)
| | +- TENSOR_MATCH: check_tensor(L['hidden_states'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 832, 768], stride=[638976, 768, 1])
| | +- NO_HASATTR: hasattr(L['hidden_states'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['___stack0'][0], L['hidden_states'])
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'], accessed_by=DictGetItemGuardAccessor(__import_torch_dot_nn_dot_modules_dot_module)
| | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'], 139842442598640)
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'].torch, accessed_by=GetAttrGuardAccessor(torch)
| | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'].torch, 139845236322800)
| | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C, accessed_by=GetAttrGuardAccessor(_C)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C, 139845228547104)
| | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C._get_tracing_state, accessed_by=GetAttrGuardAccessor(_get_tracing_state)
| | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C._get_tracing_state, 139842451895088)
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_hooks, accessed_by=GetAttrGuardAccessor(_global_forward_hooks)
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_hooks, 7497696)
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_hooks
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_hooks, accessed_by=GetAttrGuardAccessor(_global_backward_hooks)
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_hooks, 7497696)
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_hooks
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_pre_hooks, accessed_by=GetAttrGuardAccessor(_global_forward_pre_hooks)
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_pre_hooks, 7497696)
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_pre_hooks
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_pre_hooks, accessed_by=GetAttrGuardAccessor(_global_backward_pre_hooks)
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_pre_hooks, 7497696)
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_pre_hooks
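Beyond the tensor checks, this tree pins module identity (ID_MATCH on self.output and its children), eval mode (each .training guarded to False, id 7685824), empty per-module and global hook dicts, and NO_TENSOR_ALIASING between the two tensor inputs. Changing any of these invalidates the entry; for instance (illustrative, `layer` is a hypothetical handle to this BigBird layer):

    layer.output.register_forward_hook(lambda mod, args, out: out)  # hooks dict no longer empty -> recompile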
V0627 17:31:09.932000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "17/0", "frame_key": "22", "co_name": "torch_dynamo_resume_in_forward_at_1401", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 1401, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 34, "shape_env_guard_count": 0, "graph_op_count": 4, "graph_node_count": 7, "graph_input_count": 2, "start_time": 1719534669.711534, "entire_frame_compile_time_s": 0.22069621086120605, "backend_compile_time_s": 0.1933588981628418, "inductor_compile_time_s": 0.11173701286315918, "code_gen_time_s": 0.08121824264526367, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": [], "dynamo_time_before_restart_s": 0.0, "has_guarded_code": true}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.933000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 18, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.935000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 44, "size": 2555904}, "frame_id": 18, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.935000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e309a5760>", "describer_id": 44}, "frame_id": 18, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.935000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 44, "id": 0, "source": "L['___stack0'][0]"}, "frame_id": 18, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.984000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {"l_stack0_0_": [1, 832, 768], "hidden_states": [1, 832, 3072], "mul": [1, 832, 3072], "pow_1": [1, 832, 3072], "mul_1": [1, 832, 3072], "add": [1, 832, 3072], "mul_2": [1, 832, 3072], "tanh": [1, 832, 3072], "add_1": [1, 832, 3072], "hidden_states_1": [1, 832, 3072], "hidden_states_2": [1, 832, 768], "hidden_states_3": [1, 832, 768], "add_2": [1, 832, 768], "hidden_states_4": [1, 832, 768]}}, "frame_id": 18, "frame_compile_id": 0, "attempt": 0, "has_payload": "28432eb8c22b77d39d8eae55f0796aec"}
class GraphModule(torch.nn.Module):
    def forward(self, L_stack0_0_: "f32[1, 832, 768][638976, 768, 1]cpu"):
        l_stack0_0_ = L_stack0_0_

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1421 in forward, code: hidden_states = self.dense(hidden_states)
        hidden_states: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = self.L__self___intermediate_dense(l_stack0_0_)

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/activations.py:57 in forward, code: return 0.5 * input * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0))))
        mul: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = 0.5 * hidden_states
        pow_1: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.pow(hidden_states, 3.0)
        mul_1: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = 0.044715 * pow_1; pow_1 = None
        add: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = hidden_states + mul_1; hidden_states = mul_1 = None
        mul_2: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = 0.7978845608028654 * add; add = None
        tanh: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.tanh(mul_2); mul_2 = None
        add_1: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = 1.0 + tanh; tanh = None
        hidden_states_1: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = mul * add_1; mul = add_1 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1435 in forward, code: hidden_states = self.dense(hidden_states)
        hidden_states_2: "bf16[1, 832, 768][638976, 768, 1]cpu" = self.L__self___output_dense(hidden_states_1); hidden_states_1 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1436 in forward, code: hidden_states = self.dropout(hidden_states)
        hidden_states_3: "bf16[1, 832, 768][638976, 768, 1]cpu" = self.L__self___output_dropout(hidden_states_2); hidden_states_2 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1437 in forward, code: hidden_states = self.LayerNorm(hidden_states + input_tensor)
        add_2: "f32[1, 832, 768][638976, 768, 1]cpu" = hidden_states_3 + l_stack0_0_; hidden_states_3 = l_stack0_0_ = None
        hidden_states_4: "f32[1, 832, 768][638976, 768, 1]cpu" = self.L__self___output_LayerNorm(add_2); add_2 = None
        return (hidden_states_4,)
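The activations.py line traced here is the tanh approximation of GELU, gelu(x) ~= 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3))), with sqrt(2/pi) ~= 0.7978845608028654. It matches PyTorch's built-in approximate mode:

    import torch
    import torch.nn.functional as F

    x = torch.randn(1024)
    ref = 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * (x + 0.044715 * torch.pow(x, 3.0))))
    assert torch.allclose(ref, F.gelu(x, approximate='tanh'), atol=1e-6)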
V0627 17:31:10.051000 139845268738432 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:199] {"aot_forward_graph": {}, "frame_id": 18, "frame_compile_id": 0, "attempt": 0, "has_payload": "c94939d327a02b378b1745a04171ca4e"}
class <lambda>(torch.nn.Module):
    def forward(self, arg0_1: "f32[3072, 768][768, 1]cpu", arg1_1: "f32[3072][1]cpu", arg2_1: "f32[768, 3072][3072, 1]cpu", arg3_1: "f32[768][1]cpu", arg4_1: "f32[768][1]cpu", arg5_1: "f32[768][1]cpu", arg6_1: "f32[1, 832, 768][638976, 768, 1]cpu"):
        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1421 in forward, code: hidden_states = self.dense(hidden_states)
        convert_element_type: "bf16[3072][1]cpu" = torch.ops.prims.convert_element_type.default(arg1_1, torch.bfloat16); arg1_1 = None
        convert_element_type_1: "bf16[3072, 768][768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg0_1, torch.bfloat16); arg0_1 = None
        convert_element_type_2: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg6_1, torch.bfloat16)
        view: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.view.default(convert_element_type_2, [832, 768]); convert_element_type_2 = None
        permute: "bf16[768, 3072][1, 768]cpu" = torch.ops.aten.permute.default(convert_element_type_1, [1, 0]); convert_element_type_1 = None
        addmm: "bf16[832, 3072][3072, 1]cpu" = torch.ops.aten.addmm.default(convert_element_type, view, permute); convert_element_type = view = permute = None
        view_1: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.view.default(addmm, [1, 832, 3072]); addmm = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/activations.py:57 in forward, code: return 0.5 * input * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0))))
        mul: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.mul.Tensor(view_1, 0.5)
        pow_1: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.pow.Tensor_Scalar(view_1, 3.0)
        mul_1: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.mul.Tensor(pow_1, 0.044715); pow_1 = None
        add: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.add.Tensor(view_1, mul_1); view_1 = mul_1 = None
        mul_2: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.mul.Tensor(add, 0.7978845608028654); add = None
        tanh: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.tanh.default(mul_2); mul_2 = None
        add_1: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.add.Tensor(tanh, 1.0); tanh = None
        mul_3: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.mul.Tensor(mul, add_1); mul = add_1 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1435 in forward, code: hidden_states = self.dense(hidden_states)
        convert_element_type_6: "bf16[768][1]cpu" = torch.ops.prims.convert_element_type.default(arg3_1, torch.bfloat16); arg3_1 = None
        convert_element_type_7: "bf16[768, 3072][3072, 1]cpu" = torch.ops.prims.convert_element_type.default(arg2_1, torch.bfloat16); arg2_1 = None
        view_2: "bf16[832, 3072][3072, 1]cpu" = torch.ops.aten.view.default(mul_3, [832, 3072]); mul_3 = None
        permute_1: "bf16[3072, 768][1, 3072]cpu" = torch.ops.aten.permute.default(convert_element_type_7, [1, 0]); convert_element_type_7 = None
        addmm_1: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.addmm.default(convert_element_type_6, view_2, permute_1); convert_element_type_6 = view_2 = permute_1 = None
        view_3: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.view.default(addmm_1, [1, 832, 768]); addmm_1 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1436 in forward, code: hidden_states = self.dropout(hidden_states)
        clone: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.clone.default(view_3); view_3 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1437 in forward, code: hidden_states = self.LayerNorm(hidden_states + input_tensor)
        add_2: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(clone, arg6_1); clone = arg6_1 = None
        var_mean = torch.ops.aten.var_mean.correction(add_2, [2], correction = 0, keepdim = True)
        getitem: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[0]
        getitem_1: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[1]; var_mean = None
        add_3: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.add.Tensor(getitem, 1e-12); getitem = None
        rsqrt: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.rsqrt.default(add_3); add_3 = None
        sub: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.sub.Tensor(add_2, getitem_1); add_2 = getitem_1 = None
        mul_4: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(sub, rsqrt); sub = rsqrt = None
        mul_5: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(mul_4, arg4_1); mul_4 = arg4_1 = None
        add_4: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(mul_5, arg5_1); mul_5 = arg5_1 = None
        return (add_4,)
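This graph is the whole BigBird feed-forward block: intermediate Linear (768 -> 3072) in bf16, the GELU chain, output Linear (3072 -> 768), dropout-as-clone, then the residual LayerNorm in fp32. A sketch of the eager computation, with dense1/dense2/ln standing in for the frozen modules:

    import torch.nn.functional as F

    h = F.gelu(dense1(x), approximate='tanh')  # 768 -> 3072, bf16 under autocast
    y = ln(dense2(h) + x)                      # 3072 -> 768, residual add + LayerNorm in f32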
V0627 17:31:10.133000 139845268738432 torch/_inductor/compile_fx.py:754] {"inductor_post_grad_graph": {}, "frame_id": 18, "frame_compile_id": 0, "attempt": 0, "has_payload": "8400618ae53b7968980ef85788f68b83"}
class <lambda>(torch.nn.Module):
    def forward(self, arg6_1: "f32[1, 832, 768][638976, 768, 1]cpu"):
        # No stacktrace found for following nodes
        _frozen_param4: "f32[768][1]cpu" = self._frozen_param4
        _frozen_param5: "f32[768][1]cpu" = self._frozen_param5

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1421 in forward, code: hidden_states = self.dense(hidden_states)
        _frozen_param6: "bf16[3072][1]cpu" = self._frozen_param6

        # No stacktrace found for following nodes
        _frozen_param10: "bf16[3072, 768][1, 0]cpu" = self._frozen_param10

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1435 in forward, code: hidden_states = self.dense(hidden_states)
        _frozen_param8: "bf16[768][1]cpu" = self._frozen_param8

        # No stacktrace found for following nodes
        _frozen_param11: "bf16[768, 3072][1, 0]cpu" = self._frozen_param11

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1421 in forward, code: hidden_states = self.dense(hidden_states)
        convert_element_type_2: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg6_1, torch.bfloat16)
        _linear_pointwise_default_3: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.mkldnn._linear_pointwise.default(convert_element_type_2, _frozen_param10, _frozen_param6, 'none', [], ''); convert_element_type_2 = _frozen_param10 = _frozen_param6 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/activations.py:57 in forward, code: return 0.5 * input * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0))))
        mul: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.mul.Tensor(_linear_pointwise_default_3, 0.5)
        pow_1: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.pow.Tensor_Scalar(_linear_pointwise_default_3, 3.0)
        mul_1: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.mul.Tensor(pow_1, 0.044715); pow_1 = None
        add: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.add.Tensor(_linear_pointwise_default_3, mul_1); _linear_pointwise_default_3 = mul_1 = None
        mul_2: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.mul.Tensor(add, 0.7978845608028654); add = None
        tanh: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.tanh.default(mul_2); mul_2 = None
        add_1: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.add.Tensor(tanh, 1.0); tanh = None
        mul_3: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.mul.Tensor(mul, add_1); mul = add_1 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1435 in forward, code: hidden_states = self.dense(hidden_states)
        _linear_pointwise_default_2: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.mkldnn._linear_pointwise.default(mul_3, _frozen_param11, _frozen_param8, 'none', [], ''); mul_3 = _frozen_param11 = _frozen_param8 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1437 in forward, code: hidden_states = self.LayerNorm(hidden_states + input_tensor)
        add_2: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(_linear_pointwise_default_2, arg6_1); _linear_pointwise_default_2 = arg6_1 = None
        var_mean = torch.ops.aten.var_mean.correction(add_2, [2], correction = 0, keepdim = True)
        getitem: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[0]
        getitem_1: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[1]; var_mean = None
        sub: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.sub.Tensor(add_2, getitem_1); add_2 = getitem_1 = None
        add_3: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.add.Tensor(getitem, 1e-12); getitem = None
        rsqrt: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.rsqrt.default(add_3); add_3 = None
        mul_4: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(sub, rsqrt); sub = rsqrt = None
        mul_5: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(mul_4, _frozen_param4); mul_4 = _frozen_param4 = None
        add_4: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(mul_5, _frozen_param5); mul_5 = _frozen_param5 = None
        return (add_4,)
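As in the previous frame, freezing folds both Linears into prepacked bf16 mkldnn._linear_pointwise calls; the elementwise GELU chain left between them is exactly what Inductor fuses into cpp_fused_add_mul_pow_tanh_1 in the file below, where pow(x, 3.0) is also strength-reduced to x*x*x (numerically benign for floats):

    import torch

    x = torch.randn(16)
    assert torch.allclose(x.pow(3.0), x * x * x)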
V0627 17:31:10.240000 139845268738432 torch/_inductor/graph.py:1693] {"inductor_output_code": {"filename": "/tmp/torchinductor_leslie/yq/cyqi5vcdu2onzw25fkzgawphp3sm6xov6rt4wwjoshykrnlnqms3.py"}, "frame_id": 18, "frame_compile_id": 0, "attempt": 0, "has_payload": "220b8ade00d54ed30a9ebc3492a6ee4d"}
# AOT ID: ['11_inference']
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
from torch._inductor.codegen.memory_planning import _align as align
from torch import device, empty_strided
from torch._inductor.async_compile import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
from torch._inductor.codegen.multi_kernel import MultiKernelCall

aten = torch.ops.aten
inductor_ops = torch.ops.inductor
_quantized = torch.ops._quantized
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
alloc_from_pool = torch.ops.inductor._alloc_from_pool
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
async_compile = AsyncCompile()

_frozen_param4 = None   # device(type='cpu') torch.float32 (768,) (1,) 7f2eb1d45300
_frozen_param5 = None   # device(type='cpu') torch.float32 (768,) (1,) 7f2eb1d45350
_frozen_param6 = None   # device(type='cpu') torch.bfloat16 (3072,) (1,) 7f2e301a7600
_frozen_param10 = None  # device(type='cpu') torch.bfloat16 (3072, 768) (1, 0) 7f2e3013c8b0
_frozen_param8 = None   # device(type='cpu') torch.bfloat16 (768,) (1,) 7f2e3013f830
_frozen_param11 = None  # device(type='cpu') torch.bfloat16 (768, 3072) (1, 0) 7f2e3013c2c0

cpp_fused__to_copy_0 = async_compile.cpp_pybinding(['const float*', 'bfloat16*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const float* in_ptr0,
                       bfloat16* out_ptr0)
{
    #pragma omp parallel num_threads(56)
    {
        int tid = omp_get_thread_num();
        {
            #pragma omp for
            for(long x0=static_cast<long>(0L); x0<static_cast<long>(638976L); x0+=static_cast<long>(16L))
            {
                auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<long>(x0), 16);
                auto tmp1 = at::vec::convert<bfloat16>(tmp0);
                tmp1.store(out_ptr0 + static_cast<long>(x0), 16);
            }
        }
    }
}
''')

cpp_fused_add_mul_pow_tanh_1 = async_compile.cpp_pybinding(['bfloat16*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(bfloat16* in_out_ptr0)
{
    #pragma omp parallel num_threads(56)
    {
        int tid = omp_get_thread_num();
        {
            #pragma omp for
            for(long x0=static_cast<long>(0L); x0<static_cast<long>(2555904L); x0+=static_cast<long>(16L))
            {
                auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_out_ptr0 + static_cast<long>(x0), 16);
                auto tmp1 = at::vec::convert<float>(tmp0);
                auto tmp2 = static_cast<float>(0.5);
                auto tmp3 = at::vec::Vectorized<float>(tmp2);
                auto tmp4 = tmp1 * tmp3;
                auto tmp5 = tmp1 * tmp1;
                auto tmp6 = tmp5 * tmp1;
                auto tmp7 = static_cast<float>(0.044715);
                auto tmp8 = at::vec::Vectorized<float>(tmp7);
                auto tmp9 = tmp6 * tmp8;
                auto tmp10 = tmp1 + tmp9;
                auto tmp11 = static_cast<float>(0.7978845608028654);
                auto tmp12 = at::vec::Vectorized<float>(tmp11);
                auto tmp13 = tmp10 * tmp12;
                auto tmp14 = decltype(tmp13)(2) / (decltype(tmp13)(1) + (decltype(tmp13)(-2) * tmp13).exp()) - decltype(tmp13)(1);
                auto tmp15 = static_cast<float>(1.0);
                auto tmp16 = at::vec::Vectorized<float>(tmp15);
                auto tmp17 = tmp14 + tmp16;
                auto tmp18 = tmp4 * tmp17;
                auto tmp19 = at::vec::convert<bfloat16>(tmp18);
                tmp19.store(in_out_ptr0 + static_cast<long>(x0), 16);
            }
        }
    }
}
''')
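# The kernel above fuses the whole GELU chain in place on the first linear's
# output (note the single in_out_ptr0 argument): values are widened
# bf16 -> f32, pow(x, 3.0) appears as x*x*x, and tanh(y) is computed with the
# exp-based identity tanh(y) = 2/(1 + exp(-2*y)) - 1 before the result is
# narrowed back to bf16 for the in-place store.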
cpp_fused_add_native_layer_norm_2 = async_compile.cpp_pybinding(['const bfloat16*', 'const float*', 'const float*', 'const float*', 'float*', 'float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const bfloat16* in_ptr0,
                       const float* in_ptr1,
                       const float* in_ptr2,
                       const float* in_ptr3,
                       float* out_ptr0,
                       float* out_ptr1,
                       float* out_ptr2)
{
    #pragma omp parallel num_threads(56)
    {
        int tid = omp_get_thread_num();
        {
            #pragma omp for
            for(long x0=static_cast<long>(0L); x0<static_cast<long>(832L); x0+=static_cast<long>(1L))
            {
                {
                    Welford<float> tmp_acc0 = Welford<float>();
                    Welford<at::vec::Vectorized<float>> tmp_acc0_vec = Welford<at::vec::Vectorized<float>>();
                    static WeightRecp<at::vec::Vectorized<float>> weight_recps(static_cast<long>(48L));
                    for(long x1=static_cast<long>(0L); x1<static_cast<long>(768L); x1+=static_cast<long>(16L))
                    {
                        auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x1 + (768L*x0)), 16);
                        auto tmp2 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(x1 + (768L*x0)), 16);
                        auto tmp1 = at::vec::convert<float>(tmp0);
                        auto tmp3 = tmp1 + tmp2;
                        tmp_acc0_vec = welford_combine(tmp_acc0_vec, tmp3, &weight_recps);
                    }
                    tmp_acc0 = welford_combine(tmp_acc0, welford_vec_reduce_all(tmp_acc0_vec));
                    out_ptr0[static_cast<long>(x0)] = static_cast<float>(tmp_acc0.mean);
                    out_ptr1[static_cast<long>(x0)] = static_cast<float>(tmp_acc0.m2);
                }
                for(long x1=static_cast<long>(0L); x1<static_cast<long>(768L); x1+=static_cast<long>(16L))
                {
                    auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x1 + (768L*x0)), 16);
                    auto tmp2 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(x1 + (768L*x0)), 16);
                    auto tmp4 = out_ptr0[static_cast<long>(x0)];
                    auto tmp7 = out_ptr1[static_cast<long>(x0)];
                    auto tmp15 = at::vec::Vectorized<float>::loadu(in_ptr2 + static_cast<long>(x1), 16);
                    auto tmp17 = at::vec::Vectorized<float>::loadu(in_ptr3 + static_cast<long>(x1), 16);
                    auto tmp1 = at::vec::convert<float>(tmp0);
                    auto tmp3 = tmp1 + tmp2;
                    auto tmp5 = at::vec::Vectorized<float>(tmp4);
                    auto tmp6 = tmp3 - tmp5;
                    auto tmp8 = static_cast<float>(768.0);
                    auto tmp9 = tmp7 / tmp8;
                    auto tmp10 = static_cast<float>(1e-12);
                    auto tmp11 = decltype(tmp9)(tmp9 + tmp10);
                    auto tmp12 = 1 / std::sqrt(tmp11);
                    auto tmp13 = at::vec::Vectorized<float>(tmp12);
                    auto tmp14 = tmp6 * tmp13;
                    auto tmp16 = tmp14 * tmp15;
                    auto tmp18 = tmp16 + tmp17;
                    tmp18.store(out_ptr2 + static_cast<long>(x1 + (768L*x0)));
                }
            }
        }
    }
}
''')

async_compile.wait(globals())
del async_compile

def call(args):
    arg6_1, = args
    args.clear()
    assert_size_stride(arg6_1, (1, 832, 768), (638976, 768, 1))
    buf0 = empty_strided_cpu((1, 832, 768), (638976, 768, 1), torch.bfloat16)
    cpp_fused__to_copy_0(arg6_1, buf0)
    buf1 = torch.ops.mkldnn._linear_pointwise(buf0, _frozen_param10, _frozen_param6, 'none', [-1], '')
    del buf0
    buf2 = buf1; del buf1  # reuse
    cpp_fused_add_mul_pow_tanh_1(buf2)
    buf3 = torch.ops.mkldnn._linear_pointwise(buf2, _frozen_param11, _frozen_param8, 'none', [-1], '')
    del buf2
    buf4 = empty_strided_cpu((1, 832, 1), (832, 1, 832), torch.float32)
    buf5 = empty_strided_cpu((1, 832, 1), (832, 1, 832), torch.float32)
    buf7 = empty_strided_cpu((1, 832, 768), (638976, 768, 1), torch.float32)
    cpp_fused_add_native_layer_norm_2(buf3, arg6_1, _frozen_param4, _frozen_param5, buf4, buf5, buf7)
    del arg6_1
    return (buf7, )

def benchmark_compiled_module(times=10, repeat=10):
    from torch._dynamo.testing import rand_strided
    from torch._inductor.utils import print_performance
    global _frozen_param4
    _frozen_param4 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.float32)
    global _frozen_param5
    _frozen_param5 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.float32)
    global _frozen_param6
    _frozen_param6 = rand_strided((3072, ), (1, ), device='cpu', dtype=torch.bfloat16)
    global _frozen_param10
    _frozen_param10 = rand_strided((3072, 768), (1, 0), device='cpu', dtype=torch.bfloat16)
    global _frozen_param8
    _frozen_param8 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.bfloat16)
    global _frozen_param11
    _frozen_param11 = rand_strided((768, 3072), (1, 0), device='cpu', dtype=torch.bfloat16)
    arg6_1 = rand_strided((1, 832, 768), (638976, 768, 1), device='cpu', dtype=torch.float32)
    fn = lambda: call([arg6_1])
    return print_performance(fn, times=times, repeat=repeat)

if __name__ == "__main__":
    from torch._inductor.wrapper_benchmark import compiled_module_main
    compiled_module_main('hf_BigBird', benchmark_compiled_module)
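The wrapper shows Inductor's memory planning at work: "buf2 = buf1; del buf1  # reuse" renames the first linear's 1x832x3072 bf16 output so cpp_fused_add_mul_pow_tanh_1 can mutate it in place, each buffer is deleted at its last use, and arg6_1 is deliberately kept alive until cpp_fused_add_native_layer_norm_2, which still needs it for the residual add.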
V0627 17:31:10.257000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 18, "frame_compile_id": 0, "attempt": 0, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:10.258000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 18, "frame_compile_id": 0, "attempt": 0, "has_payload": "f3efa14ea8c088430fc033af17fce04d"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202276400)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| | | +- GuardManager: source=L['self']._modules, accessed_by=DictGetItemGuardAccessor(_modules)
| | | | +- GuardManager: source=L['self'].output, accessed_by=DictGetItemGuardAccessor(output)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].output, 139839202272320)
| | | | | +- GuardManager: source=L['self'].output.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- DICT_CONTAINS: not ___dict_contains('forward', L['self'].output.__dict__)
| | | | | | +- GuardManager: source=L['self'].output.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.training, 7685824)
| | | | | | +- GuardManager: source=L['self'].output._modules, accessed_by=DictGetItemGuardAccessor(_modules)
| | | | | | | +- GuardManager: source=L['self'].output.dense, accessed_by=DictGetItemGuardAccessor(dense)
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.dense, 139839202267808)
| | | | | | | | +- GuardManager: source=L['self'].output.dense.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | | | | +- GuardManager: source=L['self'].output.dense.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.dense.training, 7685824)
| | | | | | | +- GuardManager: source=L['self'].output.dropout, accessed_by=DictGetItemGuardAccessor(dropout)
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.dropout, 139839202268288)
| | | | | | | | +- GuardManager: source=L['self'].output.dropout.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | | | | +- GuardManager: source=L['self'].output.dropout.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.dropout.training, 7685824)
| | | | | | | +- GuardManager: source=L['self'].output.LayerNorm, accessed_by=DictGetItemGuardAccessor(LayerNorm)
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.LayerNorm, 139839202268912)
| | | | | | | | +- GuardManager: source=L['self'].output.LayerNorm.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | | | | +- GuardManager: source=L['self'].output.LayerNorm.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.LayerNorm.training, 7685824)
| | | | | | +- GuardManager: source=L['self'].output._forward_hooks, accessed_by=DictGetItemGuardAccessor(_forward_hooks)
| | | | | | +- GuardManager: source=L['self'].output._backward_hooks, accessed_by=DictGetItemGuardAccessor(_backward_hooks)
| | | | | | +- GuardManager: source=L['self'].output._forward_pre_hooks, accessed_by=DictGetItemGuardAccessor(_forward_pre_hooks)
| | | | | | +- GuardManager: source=L['self'].output._backward_pre_hooks, accessed_by=DictGetItemGuardAccessor(_backward_pre_hooks)
| | | | +- GuardManager: source=L['self'].intermediate, accessed_by=DictGetItemGuardAccessor(intermediate)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].intermediate, 139839202275440)
| | | | | +- GuardManager: source=L['self'].intermediate.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- DICT_CONTAINS: not ___dict_contains('forward', L['self'].intermediate.__dict__)
| | | | | | +- GuardManager: source=L['self'].intermediate.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].intermediate.training, 7685824)
| | | | | | +- GuardManager: source=L['self'].intermediate._modules, accessed_by=DictGetItemGuardAccessor(_modules)
| | | | | | | +- GuardManager: source=L['self'].intermediate.dense, accessed_by=DictGetItemGuardAccessor(dense)
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].intermediate.dense, 139839202270544)
| | | | | | | | +- GuardManager: source=L['self'].intermediate.dense.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | | | | +- GuardManager: source=L['self'].intermediate.dense.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].intermediate.dense.training, 7685824)
| | | | | | | +- GuardManager: source=L['self'].intermediate.intermediate_act_fn, accessed_by=DictGetItemGuardAccessor(intermediate_act_fn)
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].intermediate.intermediate_act_fn, 139839202267616)
| | | | | | | | +- GuardManager: source=L['self'].intermediate.intermediate_act_fn.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | | | | +- DICT_CONTAINS: not ___dict_contains('forward', L['self'].intermediate.intermediate_act_fn.__dict__)
| | | | | | | | | +- GuardManager: source=L['self'].intermediate.intermediate_act_fn.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].intermediate.intermediate_act_fn.training, 7685824)
| | | | | | | | | +- GuardManager: source=L['self'].intermediate.intermediate_act_fn._forward_hooks, accessed_by=DictGetItemGuardAccessor(_forward_hooks)
| | | | | | | | | +- GuardManager: source=L['self'].intermediate.intermediate_act_fn._backward_hooks, accessed_by=DictGetItemGuardAccessor(_backward_hooks)
| | | | | | | | | +- GuardManager: source=L['self'].intermediate.intermediate_act_fn._forward_pre_hooks, accessed_by=DictGetItemGuardAccessor(_forward_pre_hooks)
| | | | | | | | | +- GuardManager: source=L['self'].intermediate.intermediate_act_fn._backward_pre_hooks, accessed_by=DictGetItemGuardAccessor(_backward_pre_hooks)
| | | | | | +- GuardManager: source=L['self'].intermediate._forward_hooks, accessed_by=DictGetItemGuardAccessor(_forward_hooks)
| | | | | | +- GuardManager: source=L['self'].intermediate._backward_hooks, accessed_by=DictGetItemGuardAccessor(_backward_hooks)
| | | | | | +- GuardManager: source=L['self'].intermediate._forward_pre_hooks, accessed_by=DictGetItemGuardAccessor(_forward_pre_hooks)
| | | | | | +- GuardManager: source=L['self'].intermediate._backward_pre_hooks, accessed_by=DictGetItemGuardAccessor(_backward_pre_hooks)
| | | +- GuardManager: source=L['self'].is_decoder, accessed_by=DictGetItemGuardAccessor(is_decoder)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].is_decoder, 7685824)
| | | +- GuardManager: source=L['self'].seq_len_dim, accessed_by=DictGetItemGuardAccessor(seq_len_dim)
| | | | +- EQUALS_MATCH: L['self'].seq_len_dim == 1
| | | +- GuardManager: source=L['self'].chunk_size_feed_forward, accessed_by=DictGetItemGuardAccessor(chunk_size_feed_forward)
| | | | +- EQUALS_MATCH: L['self'].chunk_size_feed_forward == 0
| +- GuardManager: source=L['___stack0'], accessed_by=DictGetItemGuardAccessor(___stack0)
| | +- TYPE_MATCH: ___check_type_id(L['___stack0'], 7625984)
| | +- LENGTH_CHECK: len(L['___stack0']) == 1
| | +- GuardManager: source=L['___stack0'][0], accessed_by=TupleGetItemGuardAccessor(0)
| | | +- TENSOR_MATCH: check_tensor(L['___stack0'][0], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 832, 768], stride=[638976, 768, 1])
| | | +- NO_HASATTR: hasattr(L['___stack0'][0], '_dynamo_dynamic_indices') == False
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['apply_chunking_to_forward'], accessed_by=DictGetItemGuardAccessor(apply_chunking_to_forward)
| | | +- GuardManager: source=G['apply_chunking_to_forward'].__code__, accessed_by=GetAttrGuardAccessor(__code__)
| | | | +- ID_MATCH: ___check_obj_id(G['apply_chunking_to_forward'].__code__, 139839646455872)
| | +- GuardManager: source=G['__builtins_dict___52'], accessed_by=DictGetItemGuardAccessor(__builtins_dict___52)
| | | +- GuardManager: source=G['__builtins_dict___52']['len'], accessed_by=DictGetItemGuardAccessor(len)
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___52']['len'], 139845257826832)
| | +- GuardManager: source=G['__import_transformers_dot_activations'], accessed_by=DictGetItemGuardAccessor(__import_transformers_dot_activations)
| | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_activations'], 139839665031744)
| | | +- GuardManager: source=G['__import_transformers_dot_activations'].math, accessed_by=GetAttrGuardAccessor(math)
| | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_activations'].math, 139845236089744)
| | | | +- GuardManager: source=G['__import_transformers_dot_activations'].math.pi, accessed_by=GetAttrGuardAccessor(pi)
| | | | | +- EQUALS_MATCH: G['__import_transformers_dot_activations'].math.pi == 3.141592653589793
| | | | +- GuardManager: source=G['__import_transformers_dot_activations'].math.sqrt, accessed_by=GetAttrGuardAccessor(sqrt)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_activations'].math.sqrt, 139845236093344)
| | | +- GuardManager: source=G['__import_transformers_dot_activations'].torch, accessed_by=GetAttrGuardAccessor(torch)
| | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_activations'].torch, 139845236322800)
| | | | +- GuardManager: source=G['__import_transformers_dot_activations'].torch.pow, accessed_by=GetAttrGuardAccessor(pow)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_activations'].torch.pow, 139845228824512)
| | | | +- GuardManager: source=G['__import_transformers_dot_activations'].torch.tanh, accessed_by=GetAttrGuardAccessor(tanh)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_activations'].torch.tanh, 139845228799744)
| | +- GuardManager: source=G['__import_transformers_dot_pytorch_utils'], accessed_by=DictGetItemGuardAccessor(__import_transformers_dot_pytorch_utils)
| | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_pytorch_utils'], 139839703287984)
| | | +- GuardManager: source=G['__import_transformers_dot_pytorch_utils'].inspect, accessed_by=GetAttrGuardAccessor(inspect)
| | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_pytorch_utils'].inspect, 139845236517488)
| | | | +- GuardManager: source=G['__import_transformers_dot_pytorch_utils'].inspect.signature, accessed_by=GetAttrGuardAccessor(signature)
| | | | | +- GuardManager: source=G['__import_transformers_dot_pytorch_utils'].inspect.signature.__code__, accessed_by=GetAttrGuardAccessor(__code__)
| | | | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_pytorch_utils'].inspect.signature.__code__, 139845231798640)
| | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'], accessed_by=DictGetItemGuardAccessor(__import_torch_dot_nn_dot_modules_dot_module)
| | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'], 139842442598640)
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'].torch, accessed_by=GetAttrGuardAccessor(torch)
| | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'].torch, 139845236322800)
| | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C, accessed_by=GetAttrGuardAccessor(_C)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C, 139845228547104)
| | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C._get_tracing_state, accessed_by=GetAttrGuardAccessor(_get_tracing_state)
| | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C._get_tracing_state, 139842451895088)
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_hooks, accessed_by=GetAttrGuardAccessor(_global_forward_hooks)
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_hooks, 7497696)
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_hooks
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_hooks, accessed_by=GetAttrGuardAccessor(_global_backward_hooks)
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_hooks, 7497696)
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_hooks
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_pre_hooks, accessed_by=GetAttrGuardAccessor(_global_forward_pre_hooks)
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_pre_hooks, 7497696)
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_pre_hooks
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_pre_hooks, accessed_by=GetAttrGuardAccessor(_global_backward_pre_hooks)
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_pre_hooks, 7497696)
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_pre_hooks
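The guard tree above is what makes this compiled frame reusable: on each later call the C++ guard manager walks the tree and triggers recompilation if any check fails. A conceptual Python sketch of the three most common leaf checks (helper names here are hypothetical approximations of the builtins in the dump):

def check_obj_id(obj, expected):    # ID_MATCH: same Python object as at compile time
    return id(obj) == expected

def check_type_id(obj, expected):   # TYPE_MATCH: same concrete type object
    return id(type(obj)) == expected

def equals_match(value, expected):  # EQUALS_MATCH: structural equality
    return value == expected

# e.g. the guard on L['self'].seq_len_dim above is equals_match(self.seq_len_dim, 1)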
V0627 17:31:10.259000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "18/0", "frame_key": "23", "co_name": "torch_dynamo_resume_in_forward_at_1488", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 1488, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 69, "shape_env_guard_count": 0, "graph_op_count": 13, "graph_node_count": 15, "graph_input_count": 1, "start_time": 1719534669.9335542, "entire_frame_compile_time_s": 0.3254525661468506, "backend_compile_time_s": 0.26067519187927246, "inductor_compile_time_s": 0.1273505687713623, "code_gen_time_s": 0.07860469818115234, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": [], "dynamo_time_before_restart_s": 0.0, "has_guarded_code": true}, "frame_id": 18, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:10.262000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.270000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 46, "size": 442368}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.270000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 46}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.270000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 46, "id": 0, "source": "L['band_mask']"}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.271000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 46, "size": 2555904}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.271000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e301592b0>", "describer_id": 46}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.271000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 46, "id": 1, "source": "L['hidden_states']"}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.273000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 46, "size": 3328}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.273000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 46}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.273000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 2, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 46}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.273000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 46, "id": 2, "source": "L['from_mask']"}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.275000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 4, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 46}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.275000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 46, "id": 4, "source": "L['to_mask']"}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.306000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 47, "size": 2555904}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.307000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e301592b0>", "describer_id": 47}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.307000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 47, "id": 0, "source": "L['hidden_states']"}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.308000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 47, "size": 442368}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.308000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 47}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.308000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 47, "id": 1, "source": "L['band_mask']"}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.309000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 47, "size": 3328}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.309000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 47}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.309000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 2, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 47}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.309000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 47, "id": 2, "source": "L['from_mask']"}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.310000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 4, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 47}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.311000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 47, "id": 4, "source": "L['to_mask']"}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.312000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 5, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 13, 64], "is_leaf": true, "is_view": true, "stride": [832, 64, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317fbec0>", "describer_id": 47}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.312000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 47, "id": 5, "source": "L['blocked_encoder_mask']"}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.317000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:10.317000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1, "has_payload": "9d228664307649151c1145ad228290a7"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202274768)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| | | +- GuardManager: source=L['self']._modules, accessed_by=DictGetItemGuardAccessor(_modules)
| | | | +- GuardManager: source=L['self'].attention, accessed_by=DictGetItemGuardAccessor(attention)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].attention, 139839202265168)
| | | | | +- GuardManager: source=L['self'].attention.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- GuardManager: source=L['self'].attention.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].attention.training, 7685824)
| +- GuardManager: source=L['to_mask'], accessed_by=DictGetItemGuardAccessor(to_mask)
| | +- TENSOR_MATCH: check_tensor(L['to_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 1, 832], stride=[832, 832, 832, 1])
| | +- NO_HASATTR: hasattr(L['to_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['blocked_encoder_mask'])
| +- GuardManager: source=L['band_mask'], accessed_by=DictGetItemGuardAccessor(band_mask)
| | +- TENSOR_MATCH: check_tensor(L['band_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 9, 64, 192], stride=[110592, 110592, 12288, 192, 1])
| | +- NO_HASATTR: hasattr(L['band_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['blocked_encoder_mask'])
| +- GuardManager: source=L['from_mask'], accessed_by=DictGetItemGuardAccessor(from_mask)
| | +- TENSOR_MATCH: check_tensor(L['from_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 832, 1], stride=[832, 832, 1, 1])
| | +- NO_HASATTR: hasattr(L['from_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['blocked_encoder_mask'])
| +- GuardManager: source=L['head_mask'], accessed_by=DictGetItemGuardAccessor(head_mask)
| | +- ID_MATCH: ___check_obj_id(L['head_mask'], 7636800)
| +- GuardManager: source=L['hidden_states'], accessed_by=DictGetItemGuardAccessor(hidden_states)
| | +- TENSOR_MATCH: check_tensor(L['hidden_states'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 832, 768], stride=[638976, 768, 1])
| | +- NO_HASATTR: hasattr(L['hidden_states'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['blocked_encoder_mask'])
| +- GuardManager: source=L['attention_mask'], accessed_by=DictGetItemGuardAccessor(attention_mask)
| | +- ID_MATCH: ___check_obj_id(L['attention_mask'], 7636800)
| +- GuardManager: source=L['past_key_value'], accessed_by=DictGetItemGuardAccessor(past_key_value)
| | +- ID_MATCH: ___check_obj_id(L['past_key_value'], 7636800)
| +- GuardManager: source=L['output_attentions'], accessed_by=DictGetItemGuardAccessor(output_attentions)
| | +- ID_MATCH: ___check_obj_id(L['output_attentions'], 7685824)
| +- GuardManager: source=L['blocked_encoder_mask'], accessed_by=DictGetItemGuardAccessor(blocked_encoder_mask)
| | +- TENSOR_MATCH: check_tensor(L['blocked_encoder_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 13, 64], stride=[832, 64, 1])
| | +- NO_HASATTR: hasattr(L['blocked_encoder_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['blocked_encoder_mask'])
| +- GuardManager: source=L['encoder_hidden_states'], accessed_by=DictGetItemGuardAccessor(encoder_hidden_states)
| | +- ID_MATCH: ___check_obj_id(L['encoder_hidden_states'], 7636800)
| +- GuardManager: source=L['encoder_attention_mask'], accessed_by=DictGetItemGuardAccessor(encoder_attention_mask)
| | +- ID_MATCH: ___check_obj_id(L['encoder_attention_mask'], 7636800)
V0627 17:31:10.318000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "7/1", "frame_key": "24", "co_name": "forward", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 1472, "cache_size": 0, "accumulated_cache_size": 1, "guard_count": 18, "shape_env_guard_count": 0, "graph_op_count": 0, "graph_node_count": 5, "graph_input_count": 5, "start_time": 1719534670.2629929, "entire_frame_compile_time_s": 0.05506253242492676, "backend_compile_time_s": null, "inductor_compile_time_s": null, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Graph break due to unsupported builtin numpy.random.mtrand.seed. This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind). If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround. If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/docs/main/notes/custom_operators.html for more details) or, if it is traceable, use torch.compiler.allow_in_graph."], "dynamo_time_before_restart_s": 0.04132270812988281, "has_guarded_code": true}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1}
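Note the "restart_reasons" entry in the metrics line above: tracing hit numpy.random.mtrand.seed, a C-level builtin Dynamo cannot trace, so the frame was restarted with a graph break. A hedged repro sketch of that failure mode (not taken from this run; assumes torch._dynamo.explain's call-through interface):

import numpy as np
import torch
import torch._dynamo

def fn(x):
    np.random.seed(0)   # untraceable C builtin -> graph break
    return torch.sin(x)

# explain() reports graph-break counts and reasons for a toy input.
print(torch._dynamo.explain(fn)(torch.randn(4)))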
V0627 17:31:10.318000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.320000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 48, "size": 442368}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.320000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 48}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.320000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 48, "id": 0, "source": "L['band_mask']"}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.321000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 48, "size": 2555904}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.321000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e301592b0>", "describer_id": 48}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.321000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 48, "id": 1, "source": "L['hidden_states']"}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.323000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 48, "size": 3328}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.323000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 48}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.323000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 2, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 48}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.323000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 48, "id": 2, "source": "L['from_mask']"}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.325000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 4, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 48}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.325000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 48, "id": 4, "source": "L['to_mask']"}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.355000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 49, "size": 442368}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.355000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 49}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.355000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 49, "id": 0, "source": "L['band_mask']"}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.356000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 49, "size": 2555904}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.356000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e301592b0>", "describer_id": 49}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.356000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 49, "id": 1, "source": "L['hidden_states']"}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.357000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 49, "size": 3328}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.357000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 49}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.358000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 2, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 49}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.358000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 49, "id": 2, "source": "L['from_mask']"}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.360000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 4, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 49}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.360000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 49, "id": 4, "source": "L['to_mask']"}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.362000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 5, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 13, 64], "is_leaf": true, "is_view": true, "stride": [832, 64, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317fbec0>", "describer_id": 49}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.362000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 49, "id": 5, "source": "L['from_blocked_mask']"}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.365000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {"l_band_mask_": [1, 1, 9, 64, 192], "l_from_mask_": [1, 1, 832, 1], "l_to_mask_": [1, 1, 1, 832], "band_mask": [1, 1, 9, 64, 192], "from_mask": [1, 1, 832, 1], "to_mask": [1, 1, 1, 832]}}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1, "has_payload": "b5eca5f100188f494b5033015854eb4e"}
class GraphModule(torch.nn.Module):
def forward(self, L_band_mask_: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu", L_from_mask_: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu", L_to_mask_: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu"):
l_band_mask_ = L_band_mask_
l_from_mask_ = L_from_mask_
l_to_mask_ = L_to_mask_
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1383 in forward, code: band_mask = band_mask.to(hidden_states.dtype)
band_mask: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = l_band_mask_.to(torch.float32); l_band_mask_ = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1385 in forward, code: from_mask = from_mask.to(hidden_states.dtype)
from_mask: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu" = l_from_mask_.to(torch.float32); l_from_mask_ = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1387 in forward, code: to_mask = to_mask.to(hidden_states.dtype)
to_mask: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = l_to_mask_.to(torch.float32); l_to_mask_ = None
return (band_mask, from_mask, to_mask)
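Everything this frame captured is the three dtype normalizations at the top of the BigBird encoder forward. In eager terms (a sketch; shapes taken from the describe_tensor entries above):

import torch
hidden_states = torch.randn(1, 832, 768)   # float32
band_mask = torch.randn(1, 1, 9, 64, 192)
from_mask = torch.randn(1, 1, 832, 1)
to_mask = torch.randn(1, 1, 1, 832)
# band_mask = band_mask.to(hidden_states.dtype), etc., per modeling_big_bird.py:1383-1387
band_mask, from_mask, to_mask = (t.to(hidden_states.dtype) for t in (band_mask, from_mask, to_mask))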
V0627 17:31:10.380000 139845268738432 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:199] {"aot_forward_graph": {}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1, "has_payload": "3ad949bb5c0c76a73cad2e99f5c9ebe2"}
class <lambda>(torch.nn.Module):
def forward(self, arg0_1: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu", arg1_1: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu", arg2_1: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu"):
return (arg0_1, arg1_1, arg2_1)
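The functionalized forward graph is a pure passthrough: every input is already float32, so the `.to(torch.float32)` casts are no-ops and are eliminated before lowering. The eager behavior this mirrors:

import torch
x = torch.randn(2, dtype=torch.float32)
assert x.to(torch.float32) is x   # a same-dtype cast returns the tensor itself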
V0627 17:31:10.392000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:10.392000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1, "has_payload": "78c6200e495d09cd995b82c1e530d62e"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202265168)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| | | +- GuardManager: source=L['self']._modules, accessed_by=DictGetItemGuardAccessor(_modules)
| | | | +- GuardManager: source=L['self'].self, accessed_by=DictGetItemGuardAccessor(self)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].self, 139839202264976)
| | | | | +- GuardManager: source=L['self'].self.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- GuardManager: source=L['self'].self.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].self.training, 7685824)
| | | +- GuardManager: source=L['self'].attention_type, accessed_by=DictGetItemGuardAccessor(attention_type)
| | | | +- EQUALS_MATCH: L['self'].attention_type == 'block_sparse'
| +- GuardManager: source=L['to_mask'], accessed_by=DictGetItemGuardAccessor(to_mask)
| | +- TENSOR_MATCH: check_tensor(L['to_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 1, 832], stride=[832, 832, 832, 1])
| | +- NO_HASATTR: hasattr(L['to_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['band_mask'], accessed_by=DictGetItemGuardAccessor(band_mask)
| | +- TENSOR_MATCH: check_tensor(L['band_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 9, 64, 192], stride=[110592, 110592, 12288, 192, 1])
| | +- NO_HASATTR: hasattr(L['band_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['from_mask'], accessed_by=DictGetItemGuardAccessor(from_mask)
| | +- TENSOR_MATCH: check_tensor(L['from_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 832, 1], stride=[832, 832, 1, 1])
| | +- NO_HASATTR: hasattr(L['from_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['hidden_states'], accessed_by=DictGetItemGuardAccessor(hidden_states)
| | +- TENSOR_MATCH: check_tensor(L['hidden_states'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 832, 768], stride=[638976, 768, 1])
| | +- NO_HASATTR: hasattr(L['hidden_states'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['from_blocked_mask'], accessed_by=DictGetItemGuardAccessor(from_blocked_mask)
| | +- TENSOR_MATCH: check_tensor(L['from_blocked_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 13, 64], stride=[832, 64, 1])
| | +- NO_HASATTR: hasattr(L['from_blocked_mask'], '_dynamo_dynamic_indices') == False
| | +- TENSOR_ALIASING: L['from_blocked_mask'] is L['to_blocked_mask']
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['output_attentions'], accessed_by=DictGetItemGuardAccessor(output_attentions)
| | +- ID_MATCH: ___check_obj_id(L['output_attentions'], 7685824)
| +- GuardManager: source=L['encoder_hidden_states'], accessed_by=DictGetItemGuardAccessor(encoder_hidden_states)
| | +- ID_MATCH: ___check_obj_id(L['encoder_hidden_states'], 7636800)
| +- GuardManager: source=L['to_blocked_mask'], accessed_by=DictGetItemGuardAccessor(to_blocked_mask)
| | +- TENSOR_ALIASING: L['from_blocked_mask'] is L['to_blocked_mask']
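Two aliasing guard flavors appear in this tree: TENSOR_ALIASING requires two argument names to be the very same tensor object (from_blocked_mask/to_blocked_mask above), while NO_TENSOR_ALIASING requires a group of names to be pairwise distinct objects; from_mask and to_mask are distinct views of one storage (see the describe_tensor entries earlier) and still satisfy it. A conceptual sketch with hypothetical helper names, not the real C++ checks:

def tensor_aliasing(a, b):        # TENSOR_ALIASING: same object required
    return a is b

def check_no_aliasing(*tensors):  # NO_TENSOR_ALIASING: pairwise distinct objects;
    return len(tensors) == len({id(t) for t in tensors})  # shared storage still passes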
V0627 17:31:10.392000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "8/1", "frame_key": "25", "co_name": "forward", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 1365, "cache_size": 0, "accumulated_cache_size": 1, "guard_count": 16, "shape_env_guard_count": 0, "graph_op_count": 3, "graph_node_count": 7, "graph_input_count": 3, "start_time": 1719534670.3189635, "entire_frame_compile_time_s": 0.07366013526916504, "backend_compile_time_s": 0.02211451530456543, "inductor_compile_time_s": 0.00025773048400878906, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Graph break due to unsupported builtin numpy.random.mtrand.seed. This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind). If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround. If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/docs/main/notes/custom_operators.html for more details) or, if it is traceable, use torch.compiler.allow_in_graph."], "dynamo_time_before_restart_s": 0.03475379943847656, "has_guarded_code": true}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.393000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 9, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.395000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 51, "size": 2555904}, "frame_id": 9, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.395000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e301592b0>", "describer_id": 51}, "frame_id": 9, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.395000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 51, "id": 0, "source": "L['hidden_states']"}, "frame_id": 9, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.419000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 52, "size": 2555904}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.419000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e301592b0>", "describer_id": 52}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.419000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 52, "id": 0, "source": "L['hidden_states']"}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.434000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 7, "describer_id": 52, "size": 442368}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.434000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 7, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 7, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 52}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.434000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 52, "id": 7, "source": "L['band_mask']"}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.435000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 8, "describer_id": 52, "size": 3328}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.435000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 9, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 8, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 52}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.435000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 8, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 8, "base": 9, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 52}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.435000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 52, "id": 8, "source": "L['from_mask']"}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.437000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 10, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 8, "base": 9, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 52}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.437000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 52, "id": 10, "source": "L['to_mask']"}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.438000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 11, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 13, 64], "is_leaf": true, "is_view": true, "stride": [832, 64, 1], "storage": 8, "base": 9, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317fbec0>", "describer_id": 52}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.438000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 52, "id": 11, "source": "L['from_blocked_mask']"}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.441000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {"l_hidden_states_": [1, 832, 768], "l__self___query": [1, 832, 768], "x": [1, 832, 12, 64], "query_layer": [1, 12, 832, 64], "l__self___key": [1, 832, 768], "x_1": [1, 832, 12, 64], "key_layer": [1, 12, 832, 64], "l__self___value": [1, 832, 768], "x_2": [1, 832, 12, 64], "value_layer": [1, 12, 832, 64]}}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1, "has_payload": "0aeb326082c3dce6efb144a844a46bdf"}
class GraphModule(torch.nn.Module):
    def forward(self, L_hidden_states_: "f32[1, 832, 768][638976, 768, 1]cpu"):
        l_hidden_states_ = L_hidden_states_

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:468 in forward, code: query_layer = self.transpose_for_scores(self.query(hidden_states))
        l__self___query: "bf16[1, 832, 768][638976, 768, 1]cpu" = self.L__self___query(l_hidden_states_)

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
        x: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = l__self___query.view(1, 832, 12, 64); l__self___query = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
        query_layer: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = x.permute(0, 2, 1, 3); x = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:469 in forward, code: key_layer = self.transpose_for_scores(self.key(hidden_states))
        l__self___key: "bf16[1, 832, 768][638976, 768, 1]cpu" = self.L__self___key(l_hidden_states_)

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
        x_1: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = l__self___key.view(1, 832, 12, 64); l__self___key = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
        key_layer: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = x_1.permute(0, 2, 1, 3); x_1 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:470 in forward, code: value_layer = self.transpose_for_scores(self.value(hidden_states))
        l__self___value: "bf16[1, 832, 768][638976, 768, 1]cpu" = self.L__self___value(l_hidden_states_); l_hidden_states_ = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
        x_2: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = l__self___value.view(1, 832, 12, 64); l__self___value = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
        value_layer: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = x_2.permute(0, 2, 1, 3); x_2 = None
        return (query_layer, key_layer, value_layer)
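The graph above records BigBird's transpose_for_scores as a metadata-only view followed by a permute. A minimal eager-mode sketch of the same reshaping, using the shapes and strides from the trace (illustration only, not code from this log):

    import torch

    hidden = torch.randn(1, 832, 768)   # [batch, seq, hidden], contiguous
    x = hidden.view(1, 832, 12, 64)     # split hidden dim into 12 heads x 64
    layer = x.permute(0, 2, 1, 3)       # [1, 12, 832, 64], no data movement
    assert layer.stride() == (638976, 64, 768, 1)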
V0627 17:31:10.491000 139845268738432 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:199] {"aot_forward_graph": {}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1, "has_payload": "d91c82abb65a6c80f96ee652ae86f63d"}
class <lambda>(torch.nn.Module):
    def forward(self, arg0_1: "f32[768, 768][768, 1]cpu", arg1_1: "f32[768][1]cpu", arg2_1: "f32[768, 768][768, 1]cpu", arg3_1: "f32[768][1]cpu", arg4_1: "f32[768, 768][768, 1]cpu", arg5_1: "f32[768][1]cpu", arg6_1: "f32[1, 832, 768][638976, 768, 1]cpu"):
        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:468 in forward, code: query_layer = self.transpose_for_scores(self.query(hidden_states))
        convert_element_type: "bf16[768][1]cpu" = torch.ops.prims.convert_element_type.default(arg1_1, torch.bfloat16); arg1_1 = None
        convert_element_type_1: "bf16[768, 768][768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg0_1, torch.bfloat16); arg0_1 = None
        convert_element_type_2: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg6_1, torch.bfloat16)
        view: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.view.default(convert_element_type_2, [832, 768]); convert_element_type_2 = None
        permute: "bf16[768, 768][1, 768]cpu" = torch.ops.aten.permute.default(convert_element_type_1, [1, 0]); convert_element_type_1 = None
        addmm: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.addmm.default(convert_element_type, view, permute); convert_element_type = view = permute = None
        view_1: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.view.default(addmm, [1, 832, 768]); addmm = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
        view_2: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.view.default(view_1, [1, 832, 12, 64]); view_1 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
        permute_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_2, [0, 2, 1, 3]); view_2 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:469 in forward, code: key_layer = self.transpose_for_scores(self.key(hidden_states))
        convert_element_type_6: "bf16[768][1]cpu" = torch.ops.prims.convert_element_type.default(arg3_1, torch.bfloat16); arg3_1 = None
        convert_element_type_7: "bf16[768, 768][768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg2_1, torch.bfloat16); arg2_1 = None
        convert_element_type_8: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg6_1, torch.bfloat16)
        view_3: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.view.default(convert_element_type_8, [832, 768]); convert_element_type_8 = None
        permute_2: "bf16[768, 768][1, 768]cpu" = torch.ops.aten.permute.default(convert_element_type_7, [1, 0]); convert_element_type_7 = None
        addmm_1: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.addmm.default(convert_element_type_6, view_3, permute_2); convert_element_type_6 = view_3 = permute_2 = None
        view_4: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.view.default(addmm_1, [1, 832, 768]); addmm_1 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
        view_5: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.view.default(view_4, [1, 832, 12, 64]); view_4 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
        permute_3: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_5, [0, 2, 1, 3]); view_5 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:470 in forward, code: value_layer = self.transpose_for_scores(self.value(hidden_states))
        convert_element_type_12: "bf16[768][1]cpu" = torch.ops.prims.convert_element_type.default(arg5_1, torch.bfloat16); arg5_1 = None
        convert_element_type_13: "bf16[768, 768][768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg4_1, torch.bfloat16); arg4_1 = None
        convert_element_type_14: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg6_1, torch.bfloat16); arg6_1 = None
        view_6: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.view.default(convert_element_type_14, [832, 768]); convert_element_type_14 = None
        permute_4: "bf16[768, 768][1, 768]cpu" = torch.ops.aten.permute.default(convert_element_type_13, [1, 0]); convert_element_type_13 = None
        addmm_2: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.addmm.default(convert_element_type_12, view_6, permute_4); convert_element_type_12 = view_6 = permute_4 = None
        view_7: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.view.default(addmm_2, [1, 832, 768]); addmm_2 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
        view_8: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.view.default(view_7, [1, 832, 12, 64]); view_7 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
        permute_5: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_8, [0, 2, 1, 3]); view_8 = None
        return (permute_1, permute_3, permute_5)
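The convert_element_type casts to bf16 around each linear come from autocast rather than from the model itself (note the AutocastCPU key in the tensor guards further below). A minimal sketch of reproducing that dtype behavior, assuming the benchmark runs the forward pass under CPU autocast:

    import torch

    lin = torch.nn.Linear(768, 768)  # fp32 weight/bias, like arg0_1/arg1_1 above
    x = torch.randn(1, 832, 768)
    with torch.no_grad(), torch.autocast(device_type="cpu", dtype=torch.bfloat16):
        y = lin(x)                   # inputs and weights are cast to bf16 before addmm
    assert y.dtype == torch.bfloat16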
V0627 17:31:10.557000 139845268738432 torch/_inductor/compile_fx.py:754] {"inductor_post_grad_graph": {}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1, "has_payload": "1df7e432445d21ccb26ae7f35b4ee86e"}
class <lambda>(torch.nn.Module):
    def forward(self, arg6_1: "f32[1, 832, 768][638976, 768, 1]cpu"):
        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:468 in forward, code: query_layer = self.transpose_for_scores(self.query(hidden_states))
        _frozen_param6: "bf16[768][1]cpu" = self._frozen_param6

        # No stacktrace found for following nodes
        _frozen_param12: "bf16[768, 768][1, 0]cpu" = self._frozen_param12

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:469 in forward, code: key_layer = self.transpose_for_scores(self.key(hidden_states))
        _frozen_param8: "bf16[768][1]cpu" = self._frozen_param8

        # No stacktrace found for following nodes
        _frozen_param13: "bf16[768, 768][1, 0]cpu" = self._frozen_param13

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:470 in forward, code: value_layer = self.transpose_for_scores(self.value(hidden_states))
        _frozen_param10: "bf16[768][1]cpu" = self._frozen_param10

        # No stacktrace found for following nodes
        _frozen_param14: "bf16[768, 768][1, 0]cpu" = self._frozen_param14

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:468 in forward, code: query_layer = self.transpose_for_scores(self.query(hidden_states))
        convert_element_type_2: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg6_1, torch.bfloat16); arg6_1 = None
        _linear_pointwise_default_5: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.mkldnn._linear_pointwise.default(convert_element_type_2, _frozen_param12, _frozen_param6, 'none', [], ''); _frozen_param12 = _frozen_param6 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
        view_2: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.reshape.default(_linear_pointwise_default_5, [1, 832, 12, 64]); _linear_pointwise_default_5 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
        permute_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_2, [0, 2, 1, 3]); view_2 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:469 in forward, code: key_layer = self.transpose_for_scores(self.key(hidden_states))
        _linear_pointwise_default_4: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.mkldnn._linear_pointwise.default(convert_element_type_2, _frozen_param13, _frozen_param8, 'none', [], ''); _frozen_param13 = _frozen_param8 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
        view_5: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.reshape.default(_linear_pointwise_default_4, [1, 832, 12, 64]); _linear_pointwise_default_4 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
        permute_3: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_5, [0, 2, 1, 3]); view_5 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:470 in forward, code: value_layer = self.transpose_for_scores(self.value(hidden_states))
        _linear_pointwise_default_3: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.mkldnn._linear_pointwise.default(convert_element_type_2, _frozen_param14, _frozen_param10, 'none', [], ''); convert_element_type_2 = _frozen_param14 = _frozen_param10 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
        view_8: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.reshape.default(_linear_pointwise_default_3, [1, 832, 12, 64]); _linear_pointwise_default_3 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
        permute_5: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_8, [0, 2, 1, 3]); view_8 = None
        return (permute_1, permute_3, permute_5)
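Compared with the AOT graph, the q/k/v weights have been constant-folded into _frozen_param buffers and addmm rewritten to mkldnn._linear_pointwise; this is Inductor's freezing path for inference. A minimal sketch of compiling with freezing enabled (standard Inductor config names, not taken from this log; `model` and `example_input` are placeholders):

    import torch
    import torch._inductor.config as inductor_config

    inductor_config.freezing = True      # or set TORCHINDUCTOR_FREEZING=1 in the environment
    with torch.no_grad():
        compiled = torch.compile(model)  # `model` stands in for the eager module
        out = compiled(example_input)    # parameters are folded on first run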
V0627 17:31:10.578000 139845268738432 torch/_inductor/graph.py:1693] {"inductor_output_code": {"filename": "/tmp/torchinductor_leslie/wm/cwm7ec52zxt6bl7gt2h7sahtj5wsw4g7ez4jvozekjwtw7nqdl3v.py"}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1, "has_payload": "c068758cb8977ae26fcf611c09070a9a"}
# AOT ID: ['13_inference']
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
from torch._inductor.codegen.memory_planning import _align as align
from torch import device, empty_strided
from torch._inductor.async_compile import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
from torch._inductor.codegen.multi_kernel import MultiKernelCall

aten = torch.ops.aten
inductor_ops = torch.ops.inductor
_quantized = torch.ops._quantized
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
alloc_from_pool = torch.ops.inductor._alloc_from_pool
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
async_compile = AsyncCompile()

_frozen_param6 = None  # device(type='cpu') torch.bfloat16 (768,) (1,) 7f2e300d5490
_frozen_param12 = None  # device(type='cpu') torch.bfloat16 (768, 768) (1, 0) 7f2e300daed0
_frozen_param8 = None  # device(type='cpu') torch.bfloat16 (768,) (1,) 7f2e300a0c70
_frozen_param13 = None  # device(type='cpu') torch.bfloat16 (768, 768) (1, 0) 7f2e300a3e70
_frozen_param10 = None  # device(type='cpu') torch.bfloat16 (768,) (1,) 7f2e300a1490
_frozen_param14 = None  # device(type='cpu') torch.bfloat16 (768, 768) (1, 0) 7f2e300dbfb0

cpp_fused__to_copy_0 = async_compile.cpp_pybinding(['const float*', 'bfloat16*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const float* in_ptr0,
                       bfloat16* out_ptr0)
{
    #pragma omp parallel num_threads(56)
    {
        int tid = omp_get_thread_num();
        {
            #pragma omp for
            for(long x0=static_cast<long>(0L); x0<static_cast<long>(638976L); x0+=static_cast<long>(16L))
            {
                auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<long>(x0), 16);
                auto tmp1 = at::vec::convert<bfloat16>(tmp0);
                tmp1.store(out_ptr0 + static_cast<long>(x0), 16);
            }
        }
    }
}
''')

async_compile.wait(globals())
del async_compile

def call(args):
    arg6_1, = args
    args.clear()
    assert_size_stride(arg6_1, (1, 832, 768), (638976, 768, 1))
    buf0 = empty_strided_cpu((1, 832, 768), (638976, 768, 1), torch.bfloat16)
    cpp_fused__to_copy_0(arg6_1, buf0)
    del arg6_1
    buf1 = torch.ops.mkldnn._linear_pointwise(buf0, _frozen_param12, _frozen_param6, 'none', [-1], '')
    buf2 = torch.ops.mkldnn._linear_pointwise(buf0, _frozen_param13, _frozen_param8, 'none', [-1], '')
    buf3 = torch.ops.mkldnn._linear_pointwise(buf0, _frozen_param14, _frozen_param10, 'none', [-1], '')
    return (reinterpret_tensor(buf1, (1, 12, 832, 64), (638976, 64, 768, 1), 0), reinterpret_tensor(buf2, (1, 12, 832, 64), (638976, 64, 768, 1), 0), reinterpret_tensor(buf3, (1, 12, 832, 64), (638976, 64, 768, 1), 0), )

def benchmark_compiled_module(times=10, repeat=10):
    from torch._dynamo.testing import rand_strided
    from torch._inductor.utils import print_performance
    global _frozen_param6
    _frozen_param6 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.bfloat16)
    global _frozen_param12
    _frozen_param12 = rand_strided((768, 768), (1, 0), device='cpu', dtype=torch.bfloat16)
    global _frozen_param8
    _frozen_param8 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.bfloat16)
    global _frozen_param13
    _frozen_param13 = rand_strided((768, 768), (1, 0), device='cpu', dtype=torch.bfloat16)
    global _frozen_param10
    _frozen_param10 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.bfloat16)
    global _frozen_param14
    _frozen_param14 = rand_strided((768, 768), (1, 0), device='cpu', dtype=torch.bfloat16)
    arg6_1 = rand_strided((1, 832, 768), (638976, 768, 1), device='cpu', dtype=torch.float32)
    fn = lambda: call([arg6_1])
    return print_performance(fn, times=times, repeat=repeat)

if __name__ == "__main__":
    from torch._inductor.wrapper_benchmark import compiled_module_main
    compiled_module_main('hf_BigBird', benchmark_compiled_module)
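In call() above, reinterpret_tensor only rewrites size/stride/offset metadata, so each returned Q/K/V tensor aliases its _linear_pointwise output without a copy. An equivalent eager expression for the first output, under the shapes shown above (illustration only):

    out0 = buf1.as_strided((1, 12, 832, 64), (638976, 64, 768, 1), 0)  # same result as view + permute(0, 2, 1, 3)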
V0627 17:31:10.587000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:10.588000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1, "has_payload": "da04fa8fdd18f2f15ae08b9dbbb492e0"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202264976)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| | | +- GuardManager: source=L['self']._modules, accessed_by=DictGetItemGuardAccessor(_modules)
| | | | +- GuardManager: source=L['self'].key, accessed_by=DictGetItemGuardAccessor(key)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].key, 139839202265648)
| | | | | +- GuardManager: source=L['self'].key.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- GuardManager: source=L['self'].key.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].key.training, 7685824)
| | | | +- GuardManager: source=L['self'].query, accessed_by=DictGetItemGuardAccessor(query)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].query, 139839202265696)
| | | | | +- GuardManager: source=L['self'].query.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- GuardManager: source=L['self'].query.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].query.training, 7685824)
| | | | +- GuardManager: source=L['self'].value, accessed_by=DictGetItemGuardAccessor(value)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].value, 139839202264592)
| | | | | +- GuardManager: source=L['self'].value.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- GuardManager: source=L['self'].value.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].value.training, 7685824)
| | | +- GuardManager: source=L['self'].seed, accessed_by=DictGetItemGuardAccessor(seed)
| | | | +- EQUALS_MATCH: L['self'].seed == 1
| | | +- GuardManager: source=L['self'].block_size, accessed_by=DictGetItemGuardAccessor(block_size)
| | | | +- EQUALS_MATCH: L['self'].block_size == 64
| | | +- GuardManager: source=L['self'].num_random_blocks, accessed_by=DictGetItemGuardAccessor(num_random_blocks)
| | | | +- EQUALS_MATCH: L['self'].num_random_blocks == 3
| | | +- GuardManager: source=L['self'].attention_head_size, accessed_by=DictGetItemGuardAccessor(attention_head_size)
| | | | +- EQUALS_MATCH: L['self'].attention_head_size == 64
| | | +- GuardManager: source=L['self'].num_attention_heads, accessed_by=DictGetItemGuardAccessor(num_attention_heads)
| | | | +- EQUALS_MATCH: L['self'].num_attention_heads == 12
| +- GuardManager: source=L['to_mask'], accessed_by=DictGetItemGuardAccessor(to_mask)
| | +- TENSOR_MATCH: check_tensor(L['to_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 1, 832], stride=[832, 832, 832, 1])
| | +- NO_HASATTR: hasattr(L['to_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['band_mask'], accessed_by=DictGetItemGuardAccessor(band_mask)
| | +- TENSOR_MATCH: check_tensor(L['band_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 9, 64, 192], stride=[110592, 110592, 12288, 192, 1])
| | +- NO_HASATTR: hasattr(L['band_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['from_mask'], accessed_by=DictGetItemGuardAccessor(from_mask)
| | +- TENSOR_MATCH: check_tensor(L['from_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 832, 1], stride=[832, 832, 1, 1])
| | +- NO_HASATTR: hasattr(L['from_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['hidden_states'], accessed_by=DictGetItemGuardAccessor(hidden_states)
| | +- TENSOR_MATCH: check_tensor(L['hidden_states'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 832, 768], stride=[638976, 768, 1])
| | +- NO_HASATTR: hasattr(L['hidden_states'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['from_blocked_mask'], accessed_by=DictGetItemGuardAccessor(from_blocked_mask)
| | +- TENSOR_MATCH: check_tensor(L['from_blocked_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 13, 64], stride=[832, 64, 1])
| | +- NO_HASATTR: hasattr(L['from_blocked_mask'], '_dynamo_dynamic_indices') == False
| | +- TENSOR_ALIASING: L['from_blocked_mask'] is L['to_blocked_mask']
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['output_attentions'], accessed_by=DictGetItemGuardAccessor(output_attentions)
| | +- ID_MATCH: ___check_obj_id(L['output_attentions'], 7685824)
| +- GuardManager: source=L['to_blocked_mask'], accessed_by=DictGetItemGuardAccessor(to_blocked_mask)
| | +- TENSOR_ALIASING: L['from_blocked_mask'] is L['to_blocked_mask']
V0627 17:31:10.588000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "9/1", "frame_key": "26", "co_name": "forward", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 446, "cache_size": 0, "accumulated_cache_size": 1, "guard_count": 21, "shape_env_guard_count": 0, "graph_op_count": 9, "graph_node_count": 11, "graph_input_count": 1, "start_time": 1719534670.3936255, "entire_frame_compile_time_s": 0.19471240043640137, "backend_compile_time_s": 0.1402432918548584, "inductor_compile_time_s": 0.033010005950927734, "code_gen_time_s": 0.012862920761108398, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Graph break due to unsupported builtin numpy.random.mtrand.seed. This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind). If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround. If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/docs/main/notes/custom_operators.html for more details) or, if it is traceable, use torch.compiler.allow_in_graph."], "dynamo_time_before_restart_s": 0.024178743362426758, "has_guarded_code": true}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1}
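The restart reason above names the two available workarounds for the numpy.random.mtrand.seed graph break. A sketch of the allow_in_graph route it suggests, with a hypothetical wrapper (untested here, and only plausible because the argument is a plain int rather than a tensor):

    import numpy as np
    import torch

    def _np_seed(seed: int) -> None:    # hypothetical wrapper, not from this log
        np.random.seed(seed)

    torch.compiler.allow_in_graph(_np_seed)  # dynamo will not trace into the wrapper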
V0627 17:31:10.589000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 472, "name": "forward", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 10, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.614000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 10, "frame_compile_id": 1, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:10.614000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 10, "frame_compile_id": 1, "attempt": 1, "has_payload": "1ea07e64f0c0d490d94336fa323c05e9"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['seed'], accessed_by=DictGetItemGuardAccessor(seed)
| | +- EQUALS_MATCH: L['seed'] == 1
| +- GuardManager: source=L['batch_size'], accessed_by=DictGetItemGuardAccessor(batch_size)
| | +- EQUALS_MATCH: L['batch_size'] == 1
| +- GuardManager: source=L['to_seq_len'], accessed_by=DictGetItemGuardAccessor(to_seq_len)
| | +- EQUALS_MATCH: L['to_seq_len'] == 832
| +- GuardManager: source=L['from_seq_len'], accessed_by=DictGetItemGuardAccessor(from_seq_len)
| | +- EQUALS_MATCH: L['from_seq_len'] == 832
| +- GuardManager: source=L['to_block_size'], accessed_by=DictGetItemGuardAccessor(to_block_size)
| | +- EQUALS_MATCH: L['to_block_size'] == 64
| +- GuardManager: source=L['from_block_size'], accessed_by=DictGetItemGuardAccessor(from_block_size)
| | +- EQUALS_MATCH: L['from_block_size'] == 64
| +- GuardManager: source=L['attention_head_size'], accessed_by=DictGetItemGuardAccessor(attention_head_size)
| | +- EQUALS_MATCH: L['attention_head_size'] == 64
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['np'], accessed_by=DictGetItemGuardAccessor(np)
| | | +- ID_MATCH: ___check_obj_id(G['np'], 139845228893488)
| | | +- GuardManager: source=G['np'].random, accessed_by=GetAttrGuardAccessor(random)
| | | | +- ID_MATCH: ___check_obj_id(G['np'].random, 139842452860464)
| | | | +- GuardManager: source=G['np'].random.seed, accessed_by=GetAttrGuardAccessor(seed)
| | | | | +- ID_MATCH: ___check_obj_id(G['np'].random.seed, 139842451129264)
| | +- GuardManager: source=G['math'], accessed_by=DictGetItemGuardAccessor(math)
| | | +- ID_MATCH: ___check_obj_id(G['math'], 139845236089744)
| | | +- GuardManager: source=G['math'].sqrt, accessed_by=GetAttrGuardAccessor(sqrt)
| | | | +- ID_MATCH: ___check_obj_id(G['math'].sqrt, 139845236093344)
V0627 17:31:10.615000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "10/1", "frame_key": "27", "co_name": "bigbird_block_sparse_attention", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 516, "cache_size": 1, "accumulated_cache_size": 1, "guard_count": 17, "shape_env_guard_count": 0, "graph_op_count": 0, "graph_node_count": 0, "graph_input_count": 0, "start_time": 1719534670.5898829, "entire_frame_compile_time_s": 0.02506852149963379, "backend_compile_time_s": null, "inductor_compile_time_s": null, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Graph break due to unsupported builtin numpy.random.mtrand.seed. This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind). If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround. If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/docs/main/notes/custom_operators.html for more details) or, if it is traceable, use torch.compiler.allow_in_graph."], "dynamo_time_before_restart_s": 0.009800434112548828, "has_guarded_code": true}, "frame_id": 10, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.615000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 472, "name": "forward", "filename": 5}, {"line": 569, "name": "bigbird_block_sparse_attention", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 11, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.647000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 11, "frame_compile_id": 1, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:10.648000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 11, "frame_compile_id": 1, "attempt": 1, "has_payload": "b6b8c289bd494c29f862b3959f02ec26"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202264976)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| +- GuardManager: source=L['n_heads'], accessed_by=DictGetItemGuardAccessor(n_heads)
| | +- EQUALS_MATCH: L['n_heads'] == 12
| +- GuardManager: source=L['to_seq_len'], accessed_by=DictGetItemGuardAccessor(to_seq_len)
| | +- EQUALS_MATCH: L['to_seq_len'] == 832
| +- GuardManager: source=L['from_seq_len'], accessed_by=DictGetItemGuardAccessor(from_seq_len)
| | +- EQUALS_MATCH: L['from_seq_len'] == 832
| +- GuardManager: source=L['n_rand_blocks'], accessed_by=DictGetItemGuardAccessor(n_rand_blocks)
| | +- EQUALS_MATCH: L['n_rand_blocks'] == 3
| +- GuardManager: source=L['to_block_size'], accessed_by=DictGetItemGuardAccessor(to_block_size)
| | +- EQUALS_MATCH: L['to_block_size'] == 64
| +- GuardManager: source=L['from_block_size'], accessed_by=DictGetItemGuardAccessor(from_block_size)
| | +- EQUALS_MATCH: L['from_block_size'] == 64
| +- GuardManager: source=L['plan_from_length'], accessed_by=DictGetItemGuardAccessor(plan_from_length)
| | +- ID_MATCH: ___check_obj_id(L['plan_from_length'], 7636800)
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['__builtins_dict___69'], accessed_by=DictGetItemGuardAccessor(__builtins_dict___69)
| | | +- GuardManager: source=G['__builtins_dict___69']['int'], accessed_by=DictGetItemGuardAccessor(int)
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___69']['int'], 7648640)
V0627 17:31:10.648000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "11/1", "frame_key": "28", "co_name": "torch_dynamo_resume_in_bigbird_block_sparse_attention_at_569", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 569, "cache_size": 0, "accumulated_cache_size": 1, "guard_count": 14, "shape_env_guard_count": 0, "graph_op_count": 0, "graph_node_count": 0, "graph_input_count": 0, "start_time": 1719534670.6159284, "entire_frame_compile_time_s": 0.03219175338745117, "backend_compile_time_s": null, "inductor_compile_time_s": null, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["data dependent operator: aten._local_scalar_dense.default; to enable, set torch._dynamo.config.capture_scalar_outputs = True"], "dynamo_time_before_restart_s": 0.01743292808532715, "has_guarded_code": true}, "frame_id": 11, "frame_compile_id": 1, "attempt": 1}
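The restart reason here states the fix verbatim: a one-line config change lets dynamo capture the scalar produced by aten._local_scalar_dense (e.g. a .item() call) instead of graph-breaking:

    import torch._dynamo

    torch._dynamo.config.capture_scalar_outputs = True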
V0627 17:31:10.649000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 472, "name": "forward", "filename": 5}, {"line": 569, "name": "bigbird_block_sparse_attention", "filename": 5}, {"line": 583, "name": "torch_dynamo_resume_in_bigbird_block_sparse_attention_at_569", "filename": 5}, {"line": 1165, "name": "_bigbird_block_rand_mask_with_head", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.653000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.653000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e300921b0>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.653000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 58, "id": 0, "source": "___from_numpy(L['___stack0'][0])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.655000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.655000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30091ee0>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.655000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 58, "id": 1, "source": "___from_numpy(L['___stack0'][1])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.657000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.657000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 2, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30092c00>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.657000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 58, "id": 2, "source": "___from_numpy(L['___stack0'][2])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.659000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 3, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.659000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 3, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30152570>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.659000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 58, "id": 3, "source": "___from_numpy(L['___stack0'][3])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.661000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 4, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.661000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 4, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 4, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ffc0040>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.661000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 58, "id": 4, "source": "___from_numpy(L['___stack0'][4])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.663000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 5, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.663000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 5, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 5, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ffc0950>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.663000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 58, "id": 5, "source": "___from_numpy(L['___stack0'][5])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.665000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 6, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.665000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 6, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 6, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ffc0680>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.665000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 58, "id": 6, "source": "___from_numpy(L['___stack0'][6])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.667000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 7, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.667000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 7, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 7, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ffc1530>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.667000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 58, "id": 7, "source": "___from_numpy(L['___stack0'][7])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.669000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 8, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.669000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 8, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 8, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ffc1e90>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.669000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 58, "id": 8, "source": "___from_numpy(L['___stack0'][8])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.671000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 9, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.671000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 9, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 9, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ffc2840>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.671000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 58, "id": 9, "source": "___from_numpy(L['___stack0'][9])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.673000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 10, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.673000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 10, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 10, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ffc30b0>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.674000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 58, "id": 10, "source": "___from_numpy(L['___stack0'][10])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.675000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 11, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.676000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 11, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 11, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ffc3470>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.676000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 58, "id": 11, "source": "___from_numpy(L['___stack0'][11])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.681000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {"l_stack0_0_": [13, 3], "l_stack0_1_": [13, 3], "l_stack0_2_": [13, 3], "l_stack0_3_": [13, 3], "l_stack0_4_": [13, 3], "l_stack0_5_": [13, 3], "l_stack0_6_": [13, 3], "l_stack0_7_": [13, 3], "l_stack0_8_": [13, 3], "l_stack0_9_": [13, 3], "l_stack0_10_": [13, 3], "l_stack0_11_": [13, 3], "wrapped_getitem": [11, 3], "wrapped_getitem_1": [11, 3], "wrapped_getitem_2": [11, 3], "wrapped_getitem_3": [11, 3], "wrapped_getitem_4": [11, 3], "wrapped_getitem_5": [11, 3], "wrapped_getitem_6": [11, 3], "wrapped_getitem_7": [11, 3], "wrapped_getitem_8": [11, 3], "wrapped_getitem_9": [11, 3], "wrapped_getitem_10": [11, 3], "wrapped_getitem_11": [11, 3]}}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0, "has_payload": "17b857365a2979b2936469716bb70fcd"}
class GraphModule(torch.nn.Module):
    def forward(self, L_stack0_0_: "i32[13, 3][3, 1]cpu", L_stack0_1_: "i32[13, 3][3, 1]cpu", L_stack0_2_: "i32[13, 3][3, 1]cpu", L_stack0_3_: "i32[13, 3][3, 1]cpu", L_stack0_4_: "i32[13, 3][3, 1]cpu", L_stack0_5_: "i32[13, 3][3, 1]cpu", L_stack0_6_: "i32[13, 3][3, 1]cpu", L_stack0_7_: "i32[13, 3][3, 1]cpu", L_stack0_8_: "i32[13, 3][3, 1]cpu", L_stack0_9_: "i32[13, 3][3, 1]cpu", L_stack0_10_: "i32[13, 3][3, 1]cpu", L_stack0_11_: "i32[13, 3][3, 1]cpu"):
        l_stack0_0_ = L_stack0_0_
        l_stack0_1_ = L_stack0_1_
        l_stack0_2_ = L_stack0_2_
        l_stack0_3_ = L_stack0_3_
        l_stack0_4_ = L_stack0_4_
        l_stack0_5_ = L_stack0_5_
        l_stack0_6_ = L_stack0_6_
        l_stack0_7_ = L_stack0_7_
        l_stack0_8_ = L_stack0_8_
        l_stack0_9_ = L_stack0_9_
        l_stack0_10_ = L_stack0_10_
        l_stack0_11_ = L_stack0_11_

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1172 in torch_dynamo_resume_in__bigbird_block_rand_mask_with_head_at_1165, code: rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :]
        wrapped_getitem: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem(l_stack0_0_, (slice(1, 12, None), slice(None, None, None))); l_stack0_0_ = None
        wrapped_getitem_1: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_1(l_stack0_1_, (slice(1, 12, None), slice(None, None, None))); l_stack0_1_ = None
        wrapped_getitem_2: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_2(l_stack0_2_, (slice(1, 12, None), slice(None, None, None))); l_stack0_2_ = None
        wrapped_getitem_3: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_3(l_stack0_3_, (slice(1, 12, None), slice(None, None, None))); l_stack0_3_ = None
        wrapped_getitem_4: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_4(l_stack0_4_, (slice(1, 12, None), slice(None, None, None))); l_stack0_4_ = None
        wrapped_getitem_5: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_5(l_stack0_5_, (slice(1, 12, None), slice(None, None, None))); l_stack0_5_ = None
        wrapped_getitem_6: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_6(l_stack0_6_, (slice(1, 12, None), slice(None, None, None))); l_stack0_6_ = None
        wrapped_getitem_7: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_7(l_stack0_7_, (slice(1, 12, None), slice(None, None, None))); l_stack0_7_ = None
        wrapped_getitem_8: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_8(l_stack0_8_, (slice(1, 12, None), slice(None, None, None))); l_stack0_8_ = None
        wrapped_getitem_9: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_9(l_stack0_9_, (slice(1, 12, None), slice(None, None, None))); l_stack0_9_ = None
        wrapped_getitem_10: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_10(l_stack0_10_, (slice(1, 12, None), slice(None, None, None))); l_stack0_10_ = None
        wrapped_getitem_11: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_11(l_stack0_11_, (slice(1, 12, None), slice(None, None, None))); l_stack0_11_ = None
        return (wrapped_getitem, wrapped_getitem_1, wrapped_getitem_2, wrapped_getitem_3, wrapped_getitem_4, wrapped_getitem_5, wrapped_getitem_6, wrapped_getitem_7, wrapped_getitem_8, wrapped_getitem_9, wrapped_getitem_10, wrapped_getitem_11)
V0627 17:31:10.751000 139845268738432 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:199] {"aot_forward_graph": {}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0, "has_payload": "949e5e36955b193a43ad76da2f5a5f52"}
class <lambda>(torch.nn.Module):
    def forward(self, arg0_1: "i32[13, 3][3, 1]cpu", arg1_1: "i32[13, 3][3, 1]cpu", arg2_1: "i32[13, 3][3, 1]cpu", arg3_1: "i32[13, 3][3, 1]cpu", arg4_1: "i32[13, 3][3, 1]cpu", arg5_1: "i32[13, 3][3, 1]cpu", arg6_1: "i32[13, 3][3, 1]cpu", arg7_1: "i32[13, 3][3, 1]cpu", arg8_1: "i32[13, 3][3, 1]cpu", arg9_1: "i32[13, 3][3, 1]cpu", arg10_1: "i32[13, 3][3, 1]cpu", arg11_1: "i32[13, 3][3, 1]cpu"):
        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1172 in torch_dynamo_resume_in__bigbird_block_rand_mask_with_head_at_1165, code: rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :]
        slice_1: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg0_1, 0, 1, 12); arg0_1 = None
        slice_2: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_1, 1, 0, 9223372036854775807); slice_1 = None
        slice_3: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg1_1, 0, 1, 12); arg1_1 = None
        slice_4: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_3, 1, 0, 9223372036854775807); slice_3 = None
        slice_5: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg2_1, 0, 1, 12); arg2_1 = None
        slice_6: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_5, 1, 0, 9223372036854775807); slice_5 = None
        slice_7: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg3_1, 0, 1, 12); arg3_1 = None
        slice_8: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_7, 1, 0, 9223372036854775807); slice_7 = None
        slice_9: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg4_1, 0, 1, 12); arg4_1 = None
        slice_10: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_9, 1, 0, 9223372036854775807); slice_9 = None
        slice_11: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg5_1, 0, 1, 12); arg5_1 = None
        slice_12: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_11, 1, 0, 9223372036854775807); slice_11 = None
        slice_13: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg6_1, 0, 1, 12); arg6_1 = None
        slice_14: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_13, 1, 0, 9223372036854775807); slice_13 = None
        slice_15: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg7_1, 0, 1, 12); arg7_1 = None
        slice_16: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_15, 1, 0, 9223372036854775807); slice_15 = None
        slice_17: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg8_1, 0, 1, 12); arg8_1 = None
        slice_18: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_17, 1, 0, 9223372036854775807); slice_17 = None
        slice_19: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg9_1, 0, 1, 12); arg9_1 = None
        slice_20: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_19, 1, 0, 9223372036854775807); slice_19 = None
        slice_21: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg10_1, 0, 1, 12); arg10_1 = None
        slice_22: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_21, 1, 0, 9223372036854775807); slice_21 = None
        slice_23: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg11_1, 0, 1, 12); arg11_1 = None
        slice_24: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_23, 1, 0, 9223372036854775807); slice_23 = None
        return (slice_2, slice_4, slice_6, slice_8, slice_10, slice_12, slice_14, slice_16, slice_18, slice_20, slice_22, slice_24)
V0627 17:31:10.788000 139845268738432 torch/_inductor/compile_fx.py:754] {"inductor_post_grad_graph": {}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0, "has_payload": "9c01c298863b324227937cc74dd4d962"}
class <lambda>(torch.nn.Module):
    def forward(self, arg0_1: "i32[13, 3][3, 1]cpu", arg1_1: "i32[13, 3][3, 1]cpu", arg2_1: "i32[13, 3][3, 1]cpu", arg3_1: "i32[13, 3][3, 1]cpu", arg4_1: "i32[13, 3][3, 1]cpu", arg5_1: "i32[13, 3][3, 1]cpu", arg6_1: "i32[13, 3][3, 1]cpu", arg7_1: "i32[13, 3][3, 1]cpu", arg8_1: "i32[13, 3][3, 1]cpu", arg9_1: "i32[13, 3][3, 1]cpu", arg10_1: "i32[13, 3][3, 1]cpu", arg11_1: "i32[13, 3][3, 1]cpu"):
        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1172 in torch_dynamo_resume_in__bigbird_block_rand_mask_with_head_at_1165, code: rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :]
        slice_1: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg0_1, 0, 1, 12); arg0_1 = None
        slice_3: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg1_1, 0, 1, 12); arg1_1 = None
        slice_5: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg2_1, 0, 1, 12); arg2_1 = None
        slice_7: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg3_1, 0, 1, 12); arg3_1 = None
        slice_9: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg4_1, 0, 1, 12); arg4_1 = None
        slice_11: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg5_1, 0, 1, 12); arg5_1 = None
        slice_13: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg6_1, 0, 1, 12); arg6_1 = None
        slice_15: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg7_1, 0, 1, 12); arg7_1 = None
        slice_17: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg8_1, 0, 1, 12); arg8_1 = None
        slice_19: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg9_1, 0, 1, 12); arg9_1 = None
        slice_21: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg10_1, 0, 1, 12); arg10_1 = None
        slice_23: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg11_1, 0, 1, 12); arg11_1 = None
        return (slice_1, slice_3, slice_5, slice_7, slice_9, slice_11, slice_13, slice_15, slice_17, slice_19, slice_21, slice_23)
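
The post-grad graph above is the same computation as the pre-grad graph minus the dim-1 slices: aten.slice.Tensor with end=9223372036854775807 (INT64_MAX) covers the whole dimension, so the pass keeps only the dim-0 slices that take rows 1..12 of each (13, 3) input. A minimal standalone sketch of that equivalence (not part of the trace; the tensor values are made up):

import torch

x = torch.arange(39, dtype=torch.int32).reshape(13, 3)
# Slicing dim 1 up to INT64_MAX selects everything, so it is a no-op view.
noop = torch.ops.aten.slice.Tensor(x, 1, 0, 9223372036854775807)
assert torch.equal(noop, x)
# Only the dim-0 slice changes the shape: (13, 3) -> (11, 3), stride unchanged.
rows = torch.ops.aten.slice.Tensor(x, 0, 1, 12)
assert rows.shape == (11, 3) and rows.stride() == (3, 1)
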
V0627 17:31:10.802000 139845268738432 torch/_inductor/graph.py:1693] {"inductor_output_code": {"filename": "/tmp/torchinductor_leslie/of/cof3htzjwffvxd2lla7sn2ozynci436rdmah5vsvllsahmxz6qro.py"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0, "has_payload": "c06d796ae11c7e77048735efc71e26ca"}
# AOT ID: ['14_inference']
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
from torch._inductor.codegen.memory_planning import _align as align
from torch import device, empty_strided
from torch._inductor.async_compile import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
from torch._inductor.codegen.multi_kernel import MultiKernelCall

aten = torch.ops.aten
inductor_ops = torch.ops.inductor
_quantized = torch.ops._quantized
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
alloc_from_pool = torch.ops.inductor._alloc_from_pool
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
async_compile = AsyncCompile()
async_compile.wait(globals())
del async_compile

def call(args):
    arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1 = args
    args.clear()
    assert_size_stride(arg0_1, (13, 3), (3, 1))
    assert_size_stride(arg1_1, (13, 3), (3, 1))
    assert_size_stride(arg2_1, (13, 3), (3, 1))
    assert_size_stride(arg3_1, (13, 3), (3, 1))
    assert_size_stride(arg4_1, (13, 3), (3, 1))
    assert_size_stride(arg5_1, (13, 3), (3, 1))
    assert_size_stride(arg6_1, (13, 3), (3, 1))
    assert_size_stride(arg7_1, (13, 3), (3, 1))
    assert_size_stride(arg8_1, (13, 3), (3, 1))
    assert_size_stride(arg9_1, (13, 3), (3, 1))
    assert_size_stride(arg10_1, (13, 3), (3, 1))
    assert_size_stride(arg11_1, (13, 3), (3, 1))
    return (reinterpret_tensor(arg0_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg1_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg2_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg3_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg4_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg5_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg6_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg7_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg8_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg9_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg10_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg11_1, (11, 3), (3, 1), 3), )

def benchmark_compiled_module(times=10, repeat=10):
    from torch._dynamo.testing import rand_strided
    from torch._inductor.utils import print_performance
    arg0_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg1_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg2_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg3_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg4_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg5_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg6_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg7_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg8_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg9_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg10_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg11_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1])
    return print_performance(fn, times=times, repeat=repeat)

if __name__ == "__main__":
    from torch._inductor.wrapper_benchmark import compiled_module_main
    compiled_module_main('hf_BigBird', benchmark_compiled_module)
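
The call() above does no arithmetic at all: each output is a reinterpret_tensor view of its input at storage offset 3, which on a contiguous (13, 3) tensor is exactly what x[1:12] produces. A minimal sketch of that equivalence (not part of the generated file):

import torch

x = torch.zeros((13, 3), dtype=torch.int32)
view = x[1:12]                     # matches reinterpret_tensor(x, (11, 3), (3, 1), 3)
assert view.shape == (11, 3)
assert view.stride() == (3, 1)
assert view.storage_offset() == 3  # skips one row of 3 int32 elements
assert view.data_ptr() == x.data_ptr() + 3 * x.element_size()  # zero-copy
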
V0627 17:31:10.810000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:10.810000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0, "has_payload": "6e13f24b700fd79116617b1177bb6706"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202264976)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| +- GuardManager: source=L['___stack0'], accessed_by=DictGetItemGuardAccessor(___stack0)
| | +- TYPE_MATCH: ___check_type_id(L['___stack0'], 7650400)
| | +- LENGTH_CHECK: len(L['___stack0']) == 12
| | +- GuardManager: source=L['___stack0'][0], accessed_by=ListGetItemGuardAccessor(0)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][0]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][0]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][0]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][1], accessed_by=ListGetItemGuardAccessor(1)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][1]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][1]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][1]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][2], accessed_by=ListGetItemGuardAccessor(2)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][2]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][2]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][2]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][3], accessed_by=ListGetItemGuardAccessor(3)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][3]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][3]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][3]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][4], accessed_by=ListGetItemGuardAccessor(4)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][4]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][4]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][4]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][5], accessed_by=ListGetItemGuardAccessor(5)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][5]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][5]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][5]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][6], accessed_by=ListGetItemGuardAccessor(6)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][6]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][6]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][6]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][7], accessed_by=ListGetItemGuardAccessor(7)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][7]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][7]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][7]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][8], accessed_by=ListGetItemGuardAccessor(8)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][8]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][8]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][8]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][9], accessed_by=ListGetItemGuardAccessor(9)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][9]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][9]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][9]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][10], accessed_by=ListGetItemGuardAccessor(10)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][10]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][10]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][10]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][11], accessed_by=ListGetItemGuardAccessor(11)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][11]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][11]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][11]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| +- GuardManager: source=L['num_heads'], accessed_by=DictGetItemGuardAccessor(num_heads)
| | +- EQUALS_MATCH: L['num_heads'] == 12
| +- GuardManager: source=L['num_blocks'], accessed_by=DictGetItemGuardAccessor(num_blocks)
| | +- EQUALS_MATCH: L['num_blocks'] == 13
| +- GuardManager: source=L['global_block_top'], accessed_by=DictGetItemGuardAccessor(global_block_top)
| | +- EQUALS_MATCH: L['global_block_top'] == 1
| +- GuardManager: source=L['global_block_bottom'], accessed_by=DictGetItemGuardAccessor(global_block_bottom)
| | +- EQUALS_MATCH: L['global_block_bottom'] == 1
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['__builtins_dict___71'], accessed_by=DictGetItemGuardAccessor(__builtins_dict___71)
| | | +- GuardManager: source=G['__builtins_dict___71']['range'], accessed_by=DictGetItemGuardAccessor(range)
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___71']['range'], 7632448)
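
A rough sketch of what this guard tree re-checks before the cached compiled code is reused; this is an illustration under assumed names (frame_locals), not Dynamo's actual guard evaluator:

import torch

def guards_hold(frame_locals):
    # TYPE_MATCH / LENGTH_CHECK on ___stack0 (a list of 12 numpy arrays).
    stack0 = frame_locals["___stack0"]
    if type(stack0) is not list or len(stack0) != 12:
        return False
    # EQUALS_MATCH on the captured Python scalars.
    if frame_locals["num_heads"] != 12 or frame_locals["num_blocks"] != 13:
        return False
    if frame_locals["global_block_top"] != 1 or frame_locals["global_block_bottom"] != 1:
        return False
    # TENSOR_MATCH: each element must still view as an int32 (13, 3) tensor.
    return all(
        torch.from_numpy(a).dtype == torch.int32 and torch.from_numpy(a).shape == (13, 3)
        for a in stack0
    )
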
V0627 17:31:10.810000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "14/1", "frame_key": "29", "co_name": "torch_dynamo_resume_in__bigbird_block_rand_mask_with_head_at_1165", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 1165, "cache_size": 0, "accumulated_cache_size": 1, "guard_count": 25, "shape_env_guard_count": 0, "graph_op_count": 12, "graph_node_count": 25, "graph_input_count": 12, "start_time": 1719534670.6493185, "entire_frame_compile_time_s": 0.16145634651184082, "backend_compile_time_s": 0.12227082252502441, "inductor_compile_time_s": 0.022518634796142578, "code_gen_time_s": 0.0035479068756103516, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": [], "dynamo_time_before_restart_s": 0.0, "has_guarded_code": true}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.811000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 472, "name": "forward", "filename": 5}, {"line": 569, "name": "bigbird_block_sparse_attention", "filename": 5}, {"line": 583, "name": "torch_dynamo_resume_in_bigbird_block_sparse_attention_at_569", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.818000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.818000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ff64360>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.818000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 0, "source": "___from_numpy(L['___stack0'][0])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.819000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.819000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ff5e020>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.819000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 1, "source": "___from_numpy(L['___stack0'][1])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.820000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.820000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 2, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ff5f5b0>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.820000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 2, "source": "___from_numpy(L['___stack0'][2])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.821000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 3, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.821000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 3, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e300934c0>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.821000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 3, "source": "___from_numpy(L['___stack0'][3])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.822000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 4, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.822000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 4, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 4, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ff46cf0>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.822000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 4, "source": "___from_numpy(L['___stack0'][4])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.823000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 5, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.823000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 5, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 5, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ff44d60>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.823000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 5, "source": "___from_numpy(L['___stack0'][5])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.823000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 6, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.824000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 6, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 6, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ffb1a80>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.824000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 6, "source": "___from_numpy(L['___stack0'][6])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.824000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 7, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.825000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 7, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 7, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e300dbbf0>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.825000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 7, "source": "___from_numpy(L['___stack0'][7])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.825000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 8, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.825000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 8, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 8, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30090bd0>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.826000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 8, "source": "___from_numpy(L['___stack0'][8])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.826000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 9, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.826000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 9, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 9, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ff67920>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.826000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 9, "source": "___from_numpy(L['___stack0'][9])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.827000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 10, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.827000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 10, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 10, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ffc3010>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.827000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 10, "source": "___from_numpy(L['___stack0'][10])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.828000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 11, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.828000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 11, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 11, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ffc20c0>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.828000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 11, "source": "___from_numpy(L['___stack0'][11])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.830000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 12, "describer_id": 60, "size": 1277952}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.830000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 12, "ndim": 4, "dtype": "torch.bfloat16", "device": "device(type='cpu')", "size": [1, 12, 832, 64], "is_leaf": true, "stride": [638976, 64, 768, 1], "storage": 12, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30014c20>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.830000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 12, "source": "L['query_layer']"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.833000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 13, "describer_id": 60, "size": 3328}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.834000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 14, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 13, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.834000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 13, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 13, 64], "is_leaf": true, "is_view": true, "stride": [832, 64, 1], "storage": 13, "base": 14, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317fbec0>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.834000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 13, "source": "L['from_blocked_mask']"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.840000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 14, "describer_id": 60, "size": 1277952}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.840000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 15, "ndim": 4, "dtype": "torch.bfloat16", "device": "device(type='cpu')", "size": [1, 12, 832, 64], "is_leaf": true, "stride": [638976, 64, 768, 1], "storage": 14, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30014220>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.840000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 15, "source": "L['key_layer']"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.841000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 15, "describer_id": 60, "size": 1277952}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.842000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 16, "ndim": 4, "dtype": "torch.bfloat16", "device": "device(type='cpu')", "size": [1, 12, 832, 64], "is_leaf": true, "stride": [638976, 64, 768, 1], "storage": 15, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30015d50>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.842000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 16, "source": "L['value_layer']"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.856000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 17, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 13, "base": 14, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.856000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 17, "source": "L['to_mask']"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.893000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 16, "describer_id": 60, "size": 442368}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.893000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 18, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 16, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.893000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 18, "source": "L['band_mask']"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.936000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 19, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 13, "base": 14, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.936000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 19, "source": "L['from_mask']"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.952000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {"l_stack0_0_": [11, 3], "l_stack0_1_": [11, 3], "l_stack0_2_": [11, 3], "l_stack0_3_": [11, 3], "l_stack0_4_": [11, 3], "l_stack0_5_": [11, 3], "l_stack0_6_": [11, 3], "l_stack0_7_": [11, 3], "l_stack0_8_": [11, 3], "l_stack0_9_": [11, 3], "l_stack0_10_": [11, 3], "l_stack0_11_": [11, 3], "l_query_layer_": [1, 12, 832, 64], "l_from_blocked_mask_": [1, 13, 64], "l_key_layer_": [1, 12, 832, 64], "l_value_layer_": [1, 12, 832, 64], "l_to_mask_": [1, 1, 1, 832], "l_band_mask_": [1, 1, 9, 64, 192], "l_from_mask_": [1, 1, 832, 1], "rand_attn": [12, 11, 3], "rand_attn_1": [1, 12, 11, 3], "unsqueeze_": [1, 12, 11, 3], "rand_attn_2": [1, 12, 11, 3], "p1": [13, 64], "i1": [12, 11, 3], "flatten": [396], "getitem_2": [396, 64], "rand_mask": [1, 396, 64], "rand_mask_1": [1, 12, 11, 192], "getitem_3": [1, 11, 64], "rand_mask_2": [1, 12, 11, 64, 192], "blocked_query_matrix": [1, 12, 13, 64, 64], "blocked_key_matrix": [1, 12, 13, 64, 64], "blocked_value_matrix": [1, 12, 13, 64, 64], "shift": [396], "div": [396], "indices_shift": [396], "view_4": [396], "flattened_indices": [396], "flattened_params": [156, 64, 64], "out_flattened": [396, 64, 64], "out": [1, 12, 33, 64, 64], "gathered_key": [1, 12, 11, 192, 64], "shift_1": [396], "div_1": [396], "indices_shift_1": [396], "view_6": [396], "flattened_indices_1": [396], "flattened_params_1": [156, 64, 64], "out_flattened_1": [396, 64, 64], "out_1": [1, 12, 33, 64, 64], "gathered_value": [1, 12, 11, 192, 64], "getitem_4": [1, 12, 64, 64], "reshape_4": [12, 64, 64], "reshape_5": [12, 832, 64], "transpose": [12, 64, 832], "bmm": [12, 64, 832], "first_product": [1, 12, 64, 832], "first_product_1": [1, 12, 64, 832], "sub": [1, 1, 1, 832], "mul_3": [1, 1, 1, 832], "first_product_2": [1, 12, 64, 832], "first_attn_weights": [1, 12, 64, 832], "reshape_6": [12, 64, 832], "reshape_7": [12, 832, 64], "bmm_1": [12, 64, 64], "first_context_layer": [1, 12, 1, 64, 64], "unsqueeze__1": [1, 12, 1, 64, 64], "getitem_5": [1, 12, 64, 64], "getitem_6": [1, 12, 64, 64], "getitem_7": [1, 12, 64, 64], "getitem_8": [1, 12, 64, 64], "getitem_9": [1, 12, 192, 64], "second_key_mat": [1, 12, 448, 64], "getitem_10": [1, 12, 64, 64], "getitem_11": [1, 12, 64, 64], "getitem_12": [1, 12, 64, 64], "getitem_13": [1, 12, 64, 64], "getitem_14": [1, 12, 192, 64], "second_value_mat": [1, 12, 448, 64], "getitem_15": [1, 12, 64, 64], "reshape_8": [12, 64, 64], "reshape_9": [12, 448, 64], "transpose_1": [12, 64, 448], "bmm_2": [12, 64, 448], "second_product": [1, 12, 64, 448], "getitem_16": [1, 1, 1, 192], "getitem_17": [1, 1, 1, 64], "new_ones": [1, 1, 1, 192], "second_seq_pad": [1, 1, 1, 448], "new_ones_1": [1, 12, 64, 256], "getitem_18": [1, 12, 64, 192], "second_rand_pad": [1, 12, 64, 448], "second_product_1": [1, 12, 64, 448], "minimum": [1, 12, 64, 448], "sub_1": [1, 12, 64, 448], "mul_5": [1, 12, 64, 448], "second_product_2": [1, 12, 64, 448], "second_attn_weights": [1, 12, 64, 448], "reshape_10": [12, 64, 448], "reshape_11": [12, 448, 64], "bmm_3": [12, 64, 64], "second_context_layer": [1, 12, 1, 64, 64], "unsqueeze__2": [1, 12, 1, 64, 64], "getitem_19": [1, 12, 9, 64, 64], "getitem_20": [1, 12, 9, 64, 64], "getitem_21": [1, 12, 9, 64, 64], "exp_blocked_key_matrix": [1, 12, 9, 192, 64], "getitem_22": [1, 12, 9, 64, 64], "getitem_23": [1, 12, 9, 64, 64], "getitem_24": [1, 12, 9, 64, 64], "exp_blocked_value_matrix": [1, 12, 9, 192, 64], "middle_query_matrix": [1, 12, 9, 64, 64], 
"reshape_12": [108, 64, 64], "reshape_13": [108, 192, 64], "transpose_2": [108, 64, 192], "bmm_4": [108, 64, 192], "inner_band_product": [1, 12, 9, 64, 192], "inner_band_product_1": [1, 12, 9, 64, 192], "getitem_26": [1, 12, 9, 192, 64], "reshape_14": [108, 64, 64], "reshape_15": [108, 192, 64], "transpose_3": [108, 64, 192], "bmm_5": [108, 64, 192], "rand_band_product": [1, 12, 9, 64, 192], "rand_band_product_1": [1, 12, 9, 64, 192], "getitem_27": [1, 12, 64, 64], "first_band_product": [1, 12, 9, 64, 64], "first_band_product_1": [1, 12, 9, 64, 64], "getitem_28": [1, 12, 64, 64], "last_band_product": [1, 12, 9, 64, 64], "last_band_product_1": [1, 12, 9, 64, 64], "sub_2": [1, 1, 9, 64, 192], "mul_10": [1, 1, 9, 64, 192], "inner_band_product_2": [1, 12, 9, 64, 192], "getitem_29": [1, 1, 1, 64], "unsqueeze": [1, 1, 1, 1, 64], "sub_3": [1, 1, 1, 1, 64], "mul_11": [1, 1, 1, 1, 64], "first_band_product_2": [1, 12, 9, 64, 64], "getitem_30": [1, 1, 1, 64], "unsqueeze_1": [1, 1, 1, 1, 64], "sub_4": [1, 1, 1, 1, 64], "mul_12": [1, 1, 1, 1, 64], "last_band_product_2": [1, 12, 9, 64, 64], "getitem_31": [1, 12, 9, 64, 192], "sub_5": [1, 12, 9, 64, 192], "mul_13": [1, 12, 9, 64, 192], "rand_band_product_2": [1, 12, 9, 64, 192], "band_product": [1, 12, 9, 64, 512], "attn_weights": [1, 12, 9, 64, 512], "getitem_32": [1, 12, 9, 64, 192], "reshape_16": [108, 64, 192], "reshape_17": [108, 192, 64], "bmm_6": [108, 64, 64], "context_layer": [1, 12, 9, 64, 64], "getitem_33": [1, 12, 9, 64, 192], "getitem_34": [1, 12, 9, 192, 64], "reshape_18": [108, 64, 192], "reshape_19": [108, 192, 64], "bmm_7": [108, 64, 64], "view_15": [1, 12, 9, 64, 64], "context_layer_1": [1, 12, 9, 64, 64], "getitem_35": [1, 12, 9, 64, 64], "getitem_36": [1, 12, 64, 64], "einsum_3": [1, 12, 9, 64, 64], "context_layer_2": [1, 12, 9, 64, 64], "getitem_37": [1, 12, 9, 64, 64], "getitem_38": [1, 12, 64, 64], "einsum_4": [1, 12, 9, 64, 64], "context_layer_3": [1, 12, 9, 64, 64], "getitem_39": [1, 12, 64, 64], "getitem_40": [1, 12, 64, 64], "getitem_41": [1, 12, 64, 64], "getitem_42": [1, 12, 64, 64], "getitem_43": [1, 12, 192, 64], "second_last_key_mat": [1, 12, 448, 64], "getitem_44": [1, 12, 64, 64], "getitem_45": [1, 12, 64, 64], "getitem_46": [1, 12, 64, 64], "getitem_47": [1, 12, 64, 64], "getitem_48": [1, 12, 192, 64], "second_last_value_mat": [1, 12, 448, 64], "getitem_49": [1, 12, 64, 64], "reshape_20": [12, 64, 64], "reshape_21": [12, 448, 64], "transpose_4": [12, 64, 448], "bmm_8": [12, 64, 448], "second_last_product": [1, 12, 64, 448], "getitem_50": [1, 1, 1, 64], "getitem_51": [1, 1, 1, 192], "new_ones_2": [1, 1, 1, 192], "second_last_seq_pad": [1, 1, 1, 448], "new_ones_3": [1, 12, 64, 256], "getitem_52": [1, 12, 64, 192], "second_last_rand_pad": [1, 12, 64, 448], "second_last_product_1": [1, 12, 64, 448], "minimum_1": [1, 12, 64, 448], "sub_6": [1, 12, 64, 448], "mul_15": [1, 12, 64, 448], "second_last_product_2": [1, 12, 64, 448], "second_last_attn_weights": [1, 12, 64, 448], "reshape_22": [12, 64, 448], "reshape_23": [12, 448, 64], "bmm_9": [12, 64, 64], "second_last_context_layer": [1, 12, 1, 64, 64], "unsqueeze__3": [1, 12, 1, 64, 64], "getitem_53": [1, 12, 64, 64], "reshape_24": [12, 64, 64], "reshape_25": [12, 832, 64], "transpose_5": [12, 64, 832], "bmm_10": [12, 64, 832], "last_product": [1, 12, 64, 832], "last_product_1": [1, 12, 64, 832], "sub_7": [1, 1, 1, 832], "mul_17": [1, 1, 1, 832], "last_product_2": [1, 12, 64, 832], "last_attn_weights": [1, 12, 64, 832], "reshape_26": [12, 64, 832], "reshape_27": [12, 832, 64], 
"bmm_11": [12, 64, 64], "last_context_layer": [1, 12, 1, 64, 64], "unsqueeze__4": [1, 12, 1, 64, 64], "context_layer_4": [1, 12, 13, 64, 64], "view_20": [1, 12, 832, 64], "context_layer_5": [1, 12, 832, 64], "context_layer_6": [1, 832, 12, 64]}}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0, "has_payload": "8c75f78cbe780240c3647a51d604b94a"}
class GraphModule(torch.nn.Module):
    def forward(self, L_stack0_0_: "i32[11, 3][3, 1]cpu", L_stack0_1_: "i32[11, 3][3, 1]cpu", L_stack0_2_: "i32[11, 3][3, 1]cpu", L_stack0_3_: "i32[11, 3][3, 1]cpu", L_stack0_4_: "i32[11, 3][3, 1]cpu", L_stack0_5_: "i32[11, 3][3, 1]cpu", L_stack0_6_: "i32[11, 3][3, 1]cpu", L_stack0_7_: "i32[11, 3][3, 1]cpu", L_stack0_8_: "i32[11, 3][3, 1]cpu", L_stack0_9_: "i32[11, 3][3, 1]cpu", L_stack0_10_: "i32[11, 3][3, 1]cpu", L_stack0_11_: "i32[11, 3][3, 1]cpu", L_query_layer_: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", L_from_blocked_mask_: "f32[1, 13, 64][832, 64, 1]cpu", L_key_layer_: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", L_value_layer_: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", L_to_mask_: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu", L_band_mask_: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu", L_from_mask_: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu"):
        l_stack0_0_ = L_stack0_0_
        l_stack0_1_ = L_stack0_1_
        l_stack0_2_ = L_stack0_2_
        l_stack0_3_ = L_stack0_3_
        l_stack0_4_ = L_stack0_4_
        l_stack0_5_ = L_stack0_5_
        l_stack0_6_ = L_stack0_6_
        l_stack0_7_ = L_stack0_7_
        l_stack0_8_ = L_stack0_8_
        l_stack0_9_ = L_stack0_9_
        l_stack0_10_ = L_stack0_10_
        l_stack0_11_ = L_stack0_11_
        l_query_layer_ = L_query_layer_
        l_from_blocked_mask_ = L_from_blocked_mask_
        l_key_layer_ = L_key_layer_
        l_value_layer_ = L_value_layer_
        l_to_mask_ = L_to_mask_
        l_band_mask_ = L_band_mask_
        l_from_mask_ = L_from_mask_

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:593 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = np.stack(rand_attn, axis=0)
        rand_attn: "i32[12, 11, 3][33, 3, 1]cpu" = torch__dynamo_utils_wrapped_stack([l_stack0_0_, l_stack0_1_, l_stack0_2_, l_stack0_3_, l_stack0_4_, l_stack0_5_, l_stack0_6_, l_stack0_7_, l_stack0_8_, l_stack0_9_, l_stack0_10_, l_stack0_11_], axis = 0); l_stack0_0_ = l_stack0_1_ = l_stack0_2_ = l_stack0_3_ = l_stack0_4_ = l_stack0_5_ = l_stack0_6_ = l_stack0_7_ = l_stack0_8_ = l_stack0_9_ = l_stack0_10_ = l_stack0_11_ = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:594 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = torch.tensor(rand_attn, device=query_layer.device, dtype=torch.long)
        rand_attn_1: "i64[1, 12, 11, 3][396, 33, 3, 1]cpu" = torch.tensor(rand_attn, device = device(type='cpu'), dtype = torch.int64); rand_attn = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:595 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn.unsqueeze_(0)
        unsqueeze_: "i64[1, 12, 11, 3][396, 33, 3, 1]cpu" = rand_attn_1.unsqueeze_(0)

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:596 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = torch.cat([rand_attn for _ in range(batch_size)], dim=0)
        rand_attn_2: "i64[1, 12, 11, 3][396, 33, 3, 1]cpu" = torch.cat([rand_attn_1], dim = 0); rand_attn_1 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in _create_rand_mask_from_inputs, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
        p1: "f32[13, 64][64, 1]cpu" = l_from_blocked_mask_[0]
        i1: "i64[12, 11, 3][33, 3, 1]cpu" = rand_attn_2[0]

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in <listcomp>, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
        flatten: "i64[396][1]cpu" = i1.flatten(); i1 = None
        getitem_2: "f32[396, 64][64, 1]cpu" = p1[flatten]; p1 = flatten = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in _create_rand_mask_from_inputs, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
        rand_mask: "f32[1, 396, 64][25344, 64, 1]cpu" = torch.stack([getitem_2]); getitem_2 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1016 in _create_rand_mask_from_inputs, code: rand_mask = rand_mask.view(batch_size, num_attention_heads, num_windows, num_rand_blocks * from_block_size)
        rand_mask_1: "f32[1, 12, 11, 192][25344, 2112, 192, 1]cpu" = rand_mask.view(1, 12, 11, 192); rand_mask = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-pa
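
The _create_rand_mask_from_inputs step quoted above (the last complete step before the gist's truncation point) gathers mask rows by flattened random-block index. A minimal sketch with the shapes from this trace, using random stand-in values rather than the model's data:

import torch

from_blocked_mask = torch.ones(1, 13, 64)           # f32[1, 13, 64]
rand_attn = torch.randint(0, 13, (1, 12, 11, 3))    # i64[1, 12, 11, 3]
# One gather per batch element: p1 is [13, 64], i1.flatten() has 12*11*3 = 396 indices.
rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(from_blocked_mask, rand_attn)])
rand_mask = rand_mask.view(1, 12, 11, 192)          # 396 * 64 == 12 * 11 * 192 elements
assert rand_mask.shape == (1, 12, 11, 192)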