Skip to content

Instantly share code, notes, and snippets.

Created June 28, 2024 00:32
Show Gist options
  • Save leslie-fang-intel/193bb1ec096e619ff441484f94a0e2a3 to your computer and use it in GitHub Desktop.
Save leslie-fang-intel/193bb1ec096e619ff441484f94a0e2a3 to your computer and use it in GitHub Desktop.
trace log for 128513
This file has been truncated, but you can view the full file.
V0627 17:31:00.663000 139845268738432 torch/_logging/] {"str": ["/localdisk/leslie/torch_inductor_community/pytorch/benchmarks/dynamo/", 0]}
V0627 17:31:00.663000 139845268738432 torch/_logging/] {"str": ["/localdisk/leslie/torch_inductor_community/pytorch/benchmarks/dynamo/", 1]}
V0627 17:31:00.663000 139845268738432 torch/_logging/] {"str": ["/localdisk/leslie/torch_inductor_community/pytorch/torch/_dynamo/", 2]}
V0627 17:31:00.663000 139845268738432 torch/_logging/] {"str": ["/localdisk/leslie/torch_inductor_community/pytorch/torch/_dynamo/", 3]}
V0627 17:31:00.663000 139845268738432 torch/_dynamo/] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:00.691000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 0, "describer_id": 0, "size": 6552}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:00.692000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 819], "is_leaf": true, "stride": [819, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2ef4724e50>", "describer_id": 0}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:00.692000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 0, "id": 0, "source": "L['inputs'][0]"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:00.701000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 1, "describer_id": 0, "size": 32768}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:00.701000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 4096], "is_leaf": true, "stride": [4096, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2eb1d44450>", "describer_id": 0}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:00.701000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 0, "id": 1, "source": "L['mod'].bert.embeddings.token_type_ids"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:00.718000 139845268738432 torch/_dynamo/] {"dynamo_output_graph": {"sizes": {}}, "frame_id": 0, "frame_compile_id": 0, "attempt": 1, "has_payload": "8f3f91fb1d48d67b1336de49ea694c74"}
class GraphModule(torch.nn.Module):
def forward(self):
# No stacktrace found for following nodes
_enter_autocast = torch.amp.autocast_mode._enter_autocast('cpu', None, True, None)
_exit_autocast = torch.amp.autocast_mode._exit_autocast(_enter_autocast); _enter_autocast = None
return ()
V0627 17:31:01.398000 139845268738432 torch/_functorch/_aot_autograd/] {"aot_forward_graph": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 1, "has_payload": "845c30ca0008a08ec62276cecc47183b"}
class <lambda>(torch.nn.Module):
def forward(self):
return ()
V0627 17:31:01.498000 139845268738432 torch/_dynamo/] {"dynamo_guards": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
V0627 17:31:01.498000 139845268738432 torch/_dynamo/] {"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 1, "has_payload": "40c07a4da7b433b5416cc93985646719"}
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/ in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['mod'], accessed_by=DictGetItemGuardAccessor(mod)
| | +- ID_MATCH: ___check_obj_id(L['mod'], 139839714901824)
| | +- GuardManager: source=L['mod'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['mod'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['mod'].training, 7685824)
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- TYPE_MATCH: ___check_type_id(L['self'], 139842378438672)
| | +- GuardManager: source=L['self'].autocast, accessed_by=GetAttrGuardAccessor(autocast)
| | | +- TYPE_MATCH: ___check_type_id(L['self'].autocast, 139845255007760)
| | | +- GuardManager: source=L['self'].autocast.args, accessed_by=GetAttrGuardAccessor(args)
| | | | +- TYPE_MATCH: ___check_type_id(L['self'].autocast.args, 7625984)
| | | | +- LENGTH_CHECK: not L['self'].autocast.args
| | | +- GuardManager: source=L['self'].autocast.func, accessed_by=GetAttrGuardAccessor(func)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].autocast.func, 139844826956816)
| | | +- GuardManager: source=L['self'].autocast.keywords, accessed_by=GetAttrGuardAccessor(keywords)
| | | | +- TYPE_MATCH: ___check_type_id(L['self'].autocast.keywords, 7646656)
| | | | +- GuardManager: source=L['self'].autocast.keywords['device_type'], accessed_by=DictGetItemGuardAccessor(device_type)
| | | | | +- EQUALS_MATCH: L['self'].autocast.keywords['device_type'] == 'cpu'
| | +- GuardManager: source=L['self'].autocast_arg, accessed_by=GetAttrGuardAccessor(autocast_arg)
| | | +- TYPE_MATCH: ___check_type_id(L['self'].autocast_arg, 7646656)
| | | +- DICT_LENGTH: not L['self'].autocast_arg
| +- GuardManager: source=L['inputs'], accessed_by=DictGetItemGuardAccessor(inputs)
| | +- TYPE_MATCH: ___check_type_id(L['inputs'], 7625984)
| | +- LENGTH_CHECK: len(L['inputs']) == 1
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['__builtins_dict___1'], accessed_by=DictGetItemGuardAccessor(__builtins_dict___1)
| | | +- GuardManager: source=G['__builtins_dict___1']['dict'], accessed_by=DictGetItemGuardAccessor(dict)
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___1']['dict'], 7646656)
| | | +- GuardManager: source=G['__builtins_dict___1']['isinstance'], accessed_by=DictGetItemGuardAccessor(isinstance)
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___1']['isinstance'], 139845257826512)
V0627 17:31:01.498000 139845268738432 torch/_dynamo/] {"compilation_metrics": {"compile_id": "0/0", "frame_key": "1", "co_name": "forward_pass", "co_filename": "/localdisk/leslie/torch_inductor_community/pytorch/benchmarks/dynamo/", "co_firstlineno": 425, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 19, "shape_env_guard_count": 0, "graph_op_count": 2, "graph_node_count": 3, "graph_input_count": 0, "start_time": 1719534660.6636841, "entire_frame_compile_time_s": 0.8347411155700684, "backend_compile_time_s": 0.7748816013336182, "inductor_compile_time_s": 0.00018596649169921875, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Logger not supported for non-export cases"], "dynamo_time_before_restart_s": 0.04396843910217285, "has_guarded_code": true}, "frame_id": 0, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:01.500000 139845268738432 torch/_logging/] {"str": ["/localdisk/leslie/torch_inductor_community/pytorch/torch/nn/modules/", 4]}
V0627 17:31:01.500000 139845268738432 torch/_dynamo/] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:01.513000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 0, "describer_id": 6, "size": 6552}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:01.513000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 819], "is_leaf": true, "stride": [819, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2ef4724e50>", "describer_id": 6}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:01.514000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 6, "id": 0, "source": "L['input_ids']"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:01.519000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 1, "describer_id": 6, "size": 32768}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:01.519000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 4096], "is_leaf": true, "stride": [4096, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2eb1d44450>", "describer_id": 6}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:01.519000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 6, "id": 1, "source": "L['self'].bert.embeddings.token_type_ids"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:01.530000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 0, "describer_id": 7, "size": 6552}, "frame_id": 1, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:01.530000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 819], "is_leaf": true, "stride": [819, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2ef4724e50>", "describer_id": 7}, "frame_id": 1, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:01.530000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 7, "id": 0, "source": "L['input_ids']"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:01.535000 139845268738432 torch/_dynamo/] {"dynamo_guards": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
V0627 17:31:01.535000 139845268738432 torch/_dynamo/] {"dynamo_cpp_guards_str": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 1, "has_payload": "6017f86a7c776c49ca1dd7d3539605bb"}
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/ in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839714901824)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| | | +- GuardManager: source=L['self']._modules, accessed_by=DictGetItemGuardAccessor(_modules)
| | | | +- GuardManager: source=L['self'].bert, accessed_by=DictGetItemGuardAccessor(bert)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].bert, 139839714901584)
| | | | | +- GuardManager: source=L['self'].bert.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'], 7685824)
| | | +- GuardManager: source=L['self'].config, accessed_by=DictGetItemGuardAccessor(config)
| | | | +- TYPE_MATCH: ___check_type_id(L['self'].config, 139839810624528)
| +- GuardManager: source=L['head_mask'], accessed_by=DictGetItemGuardAccessor(head_mask)
| | +- ID_MATCH: ___check_obj_id(L['head_mask'], 7636800)
| +- GuardManager: source=L['input_ids'], accessed_by=DictGetItemGuardAccessor(input_ids)
| | +- TENSOR_MATCH: check_tensor(L['input_ids'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int64, device=None, requires_grad=False, size=[1, 819], stride=[819, 1])
| | +- NO_HASATTR: hasattr(L['input_ids'], '_dynamo_dynamic_indices') == False
| +- GuardManager: source=L['return_dict'], accessed_by=DictGetItemGuardAccessor(return_dict)
| | +- ID_MATCH: ___check_obj_id(L['return_dict'], 7636800)
| +- GuardManager: source=L['position_ids'], accessed_by=DictGetItemGuardAccessor(position_ids)
| | +- ID_MATCH: ___check_obj_id(L['position_ids'], 7636800)
| +- GuardManager: source=L['inputs_embeds'], accessed_by=DictGetItemGuardAccessor(inputs_embeds)
| | +- ID_MATCH: ___check_obj_id(L['inputs_embeds'], 7636800)
| +- GuardManager: source=L['attention_mask'], accessed_by=DictGetItemGuardAccessor(attention_mask)
| | +- ID_MATCH: ___check_obj_id(L['attention_mask'], 7636800)
| +- GuardManager: source=L['token_type_ids'], accessed_by=DictGetItemGuardAccessor(token_type_ids)
| | +- ID_MATCH: ___check_obj_id(L['token_type_ids'], 7636800)
| +- GuardManager: source=L['output_attentions'], accessed_by=DictGetItemGuardAccessor(output_attentions)
| | +- ID_MATCH: ___check_obj_id(L['output_attentions'], 7636800)
| +- GuardManager: source=L['output_hidden_states'], accessed_by=DictGetItemGuardAccessor(output_hidden_states)
| | +- ID_MATCH: ___check_obj_id(L['output_hidden_states'], 7636800)
| +- GuardManager: source=L['encoder_hidden_states'], accessed_by=DictGetItemGuardAccessor(encoder_hidden_states)
| | +- ID_MATCH: ___check_obj_id(L['encoder_hidden_states'], 7636800)
| +- GuardManager: source=L['encoder_attention_mask'], accessed_by=DictGetItemGuardAccessor(encoder_attention_mask)
| | +- ID_MATCH: ___check_obj_id(L['encoder_attention_mask'], 7636800)
V0627 17:31:01.535000 139845268738432 torch/_dynamo/] {"compilation_metrics": {"compile_id": "1/0", "frame_key": "6", "co_name": "forward", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/", "co_firstlineno": 2382, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 19, "shape_env_guard_count": 0, "graph_op_count": 0, "graph_node_count": 1, "graph_input_count": 1, "start_time": 1719534661.5002189, "entire_frame_compile_time_s": 0.03560638427734375, "backend_compile_time_s": null, "inductor_compile_time_s": null, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Logger not supported for non-export cases"], "dynamo_time_before_restart_s": 0.022696733474731445, "has_guarded_code": true}, "frame_id": 1, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:01.536000 139845268738432 torch/_logging/] {"str": ["/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/", 5]}
V0627 17:31:01.536000 139845268738432 torch/_dynamo/] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 2, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:01.542000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 0, "describer_id": 8, "size": 6552}, "frame_id": 2, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:01.542000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 819], "is_leaf": true, "stride": [819, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2ef4724e50>", "describer_id": 8}, "frame_id": 2, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:01.542000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 8, "id": 0, "source": "L['input_ids']"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:01.548000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 1, "describer_id": 8, "size": 32768}, "frame_id": 2, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:01.548000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 4096], "is_leaf": true, "stride": [4096, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2eb1d44450>", "describer_id": 8}, "frame_id": 2, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:01.548000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 8, "id": 1, "source": "L['self'].embeddings.token_type_ids"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:01.557000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 0, "describer_id": 9, "size": 6552}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:01.557000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 819], "is_leaf": true, "stride": [819, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2ef4724e50>", "describer_id": 9}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:01.557000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 9, "id": 0, "source": "L['input_ids']"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:01.562000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 1, "describer_id": 9, "size": 32768}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:01.563000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 4096], "is_leaf": true, "stride": [4096, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2eb1d44450>", "describer_id": 9}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:01.563000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 9, "id": 1, "source": "L['self'].embeddings.token_type_ids"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:01.566000 139845268738432 torch/_dynamo/] {"dynamo_output_graph": {"sizes": {"attention_mask": [1, 819], "l__self___embeddings_token_type_ids": [1, 4096], "buffered_token_type_ids": [1, 819], "buffered_token_type_ids_expanded": [1, 819]}}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "6b7bec0701d22225fb67e6f1bfb9dc36"}
class GraphModule(torch.nn.Module):
def forward(self):
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
attention_mask: "f32[1, 819][819, 1]cpu" = torch.ones((1, 819), device = device(type='cpu'))
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
l__self___embeddings_token_type_ids: "i64[1, 4096][4096, 1]cpu" = self.L__self___embeddings_token_type_ids
buffered_token_type_ids: "i64[1, 819][4096, 1]cpu" = l__self___embeddings_token_type_ids[(slice(None, None, None), slice(None, 819, None))]; l__self___embeddings_token_type_ids = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
buffered_token_type_ids_expanded: "i64[1, 819][4096, 1]cpu" = buffered_token_type_ids.expand(1, 819); buffered_token_type_ids = None
return (attention_mask, buffered_token_type_ids_expanded)
V0627 17:31:01.581000 139845268738432 torch/_functorch/_aot_autograd/] {"aot_forward_graph": {}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "1b3fb2899c356f991117f2262727f0ef"}
class <lambda>(torch.nn.Module):
def forward(self, arg0_1: "i64[1, 4096][4096, 1]cpu"):
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
full: "f32[1, 819][819, 1]cpu" = torch.ops.aten.full.default([1, 819], 1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
slice_1: "i64[1, 4096][4096, 1]cpu" = torch.ops.aten.slice.Tensor(arg0_1, 0, 0, 9223372036854775807); arg0_1 = None
slice_2: "i64[1, 819][4096, 1]cpu" = torch.ops.aten.slice.Tensor(slice_1, 1, 0, 819); slice_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
expand: "i64[1, 819][4096, 1]cpu" = torch.ops.aten.expand.default(slice_2, [1, 819]); slice_2 = None
return (full, expand)
V0627 17:31:01.707000 139845268738432 torch/_inductor/] {"inductor_post_grad_graph": {}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "4a69dc4d0dfb43287c6abf210e06617e"}
class <lambda>(torch.nn.Module):
def forward(self, arg0_1: "i64[1, 4096][4096, 1]cpu"):
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
full_default: "f32[1, 819][819, 1]cpu" = torch.ops.aten.full.default([1, 819], 1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
slice_2: "i64[1, 819][4096, 1]cpu" = torch.ops.aten.slice.Tensor(arg0_1, 1, 0, 819); arg0_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
expand: "i64[1, 819][4096, 1]cpu" = torch.ops.aten.expand.default(slice_2, [1, 819]); slice_2 = None
return (full_default, expand)
V0627 17:31:02.787000 139845268738432 torch/_inductor/] {"inductor_output_code": {"filename": "/tmp/torchinductor_leslie/ko/"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "0244b4281966e5f52ba168279eb45118"}
# AOT ID: ['1_inference']
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
from torch._inductor.codegen.memory_planning import _align as align
from torch import device, empty_strided
from torch._inductor.async_compile import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
from torch._inductor.codegen.multi_kernel import MultiKernelCall
aten = torch.ops.aten
inductor_ops = torch.ops.inductor
_quantized = torch.ops._quantized
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
alloc_from_pool = torch.ops.inductor._alloc_from_pool
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
async_compile = AsyncCompile()
cpp_fused_ones_0 = async_compile.cpp_pybinding(['float*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(float* out_ptr0)
for(long x0=static_cast<long>(0L); x0<static_cast<long>(816L); x0+=static_cast<long>(16L))
auto tmp0 = static_cast<float>(1.0);
auto tmp1 = at::vec::Vectorized<float>(tmp0); + static_cast<long>(x0));
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(816L); x0<static_cast<long>(819L); x0+=static_cast<long>(1L))
auto tmp0 = static_cast<float>(1.0);
out_ptr0[static_cast<long>(x0)] = tmp0;
del async_compile
def call(args):
arg0_1, = args
assert_size_stride(arg0_1, (1, 4096), (4096, 1))
buf0 = empty_strided_cpu((1, 819), (819, 1), torch.float32)
return (buf0, reinterpret_tensor(arg0_1, (1, 819), (4096, 1), 0), )
def benchmark_compiled_module(times=10, repeat=10):
from torch._dynamo.testing import rand_strided
from torch._inductor.utils import print_performance
arg0_1 = rand_strided((1, 4096), (4096, 1), device='cpu', dtype=torch.int64)
fn = lambda: call([arg0_1])
return print_performance(fn, times=times, repeat=repeat)
if __name__ == "__main__":
from torch._inductor.wrapper_benchmark import compiled_module_main
compiled_module_main('hf_BigBird', benchmark_compiled_module)
V0627 17:31:02.814000 139845268738432 torch/_dynamo/] {"dynamo_guards": {}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
V0627 17:31:02.815000 139845268738432 torch/_dynamo/] {"dynamo_cpp_guards_str": {}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "0b1c2f71c2e67149726041714c77db6e"}
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/ in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839714901584)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| | | +- GuardManager: source=L['self'].config, accessed_by=DictGetItemGuardAccessor(config)
| | | | +- TYPE_MATCH: ___check_type_id(L['self'].config, 139839810624528)
| | | +- GuardManager: source=L['self']._modules, accessed_by=DictGetItemGuardAccessor(_modules)
| | | | +- GuardManager: source=L['self'].embeddings, accessed_by=DictGetItemGuardAccessor(embeddings)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings, 139839713378208)
| | | | | +- GuardManager: source=L['self'].embeddings.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'], 7685824)
| | | | | | +- GuardManager: source=L['self'].embeddings._buffers, accessed_by=DictGetItemGuardAccessor(_buffers)
| | | | | | | +- GuardManager: source=L['self'].embeddings.token_type_ids, accessed_by=DictGetItemGuardAccessor(token_type_ids)
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.token_type_ids, 139838528701520)
| | | +- GuardManager: source=L['self'].attention_type, accessed_by=DictGetItemGuardAccessor(attention_type)
| | | | +- EQUALS_MATCH: L['self'].attention_type == 'block_sparse'
| +- GuardManager: source=L['input_ids'], accessed_by=DictGetItemGuardAccessor(input_ids)
| | +- TENSOR_MATCH: check_tensor(L['input_ids'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int64, device=None, requires_grad=False, size=[1, 819], stride=[819, 1])
| | +- NO_HASATTR: hasattr(L['input_ids'], '_dynamo_dynamic_indices') == False
| +- GuardManager: source=L['return_dict'], accessed_by=DictGetItemGuardAccessor(return_dict)
| | +- ID_MATCH: ___check_obj_id(L['return_dict'], 7685856)
| +- GuardManager: source=L['position_ids'], accessed_by=DictGetItemGuardAccessor(position_ids)
| | +- ID_MATCH: ___check_obj_id(L['position_ids'], 7636800)
| +- GuardManager: source=L['inputs_embeds'], accessed_by=DictGetItemGuardAccessor(inputs_embeds)
| | +- ID_MATCH: ___check_obj_id(L['inputs_embeds'], 7636800)
| +- GuardManager: source=L['attention_mask'], accessed_by=DictGetItemGuardAccessor(attention_mask)
| | +- ID_MATCH: ___check_obj_id(L['attention_mask'], 7636800)
| +- GuardManager: source=L['token_type_ids'], accessed_by=DictGetItemGuardAccessor(token_type_ids)
| | +- ID_MATCH: ___check_obj_id(L['token_type_ids'], 7636800)
| +- GuardManager: source=L['past_key_values'], accessed_by=DictGetItemGuardAccessor(past_key_values)
| | +- ID_MATCH: ___check_obj_id(L['past_key_values'], 7636800)
| +- GuardManager: source=L['output_attentions'], accessed_by=DictGetItemGuardAccessor(output_attentions)
| | +- ID_MATCH: ___check_obj_id(L['output_attentions'], 7636800)
| +- GuardManager: source=L['output_hidden_states'], accessed_by=DictGetItemGuardAccessor(output_hidden_states)
| | +- ID_MATCH: ___check_obj_id(L['output_hidden_states'], 7636800)
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['torch'], accessed_by=DictGetItemGuardAccessor(torch)
| | | +- ID_MATCH: ___check_obj_id(G['torch'], 139845236322800)
| | | +- GuardManager: source=G['torch'].ones, accessed_by=GetAttrGuardAccessor(ones)
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].ones, 139845228734288)
| | +- GuardManager: source=G['__import_torch'], accessed_by=DictGetItemGuardAccessor(__import_torch)
| | | +- GuardManager: source=G['__import_torch'].fx, accessed_by=GetAttrGuardAccessor(fx)
| | | | +- ID_MATCH: ___check_obj_id(G['__import_torch'].fx, 139842407409488)
| | | | +- GuardManager: source=G['__import_torch'].fx.Proxy, accessed_by=GetAttrGuardAccessor(Proxy)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch'].fx.Proxy, 139842429035536)
| | | +- GuardManager: source=G['__import_torch']._dynamo, accessed_by=GetAttrGuardAccessor(_dynamo)
| | | | +- ID_MATCH: ___check_obj_id(G['__import_torch']._dynamo, 139839776121264)
| | | | +- GuardManager: source=G['__import_torch']._dynamo.is_compiling, accessed_by=GetAttrGuardAccessor(is_compiling)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch']._dynamo.is_compiling, 139839726529856)
| | +- GuardManager: source=G['__builtins_dict___9'], accessed_by=DictGetItemGuardAccessor(__builtins_dict___9)
| | | +- GuardManager: source=G['__builtins_dict___9']['hasattr'], accessed_by=DictGetItemGuardAccessor(hasattr)
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___9']['hasattr'], 139845257826112)
| | | +- GuardManager: source=G['__builtins_dict___9']['isinstance'], accessed_by=DictGetItemGuardAccessor(isinstance)
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___9']['isinstance'], 139845257826512)
| | +- GuardManager: source=G['__import_transformers_dot_modeling_utils'], accessed_by=DictGetItemGuardAccessor(__import_transformers_dot_modeling_utils)
| | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_modeling_utils'], 139839661201088)
| | | +- GuardManager: source=G['__import_transformers_dot_modeling_utils'].torch, accessed_by=GetAttrGuardAccessor(torch)
| | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_modeling_utils'].torch, 139845236322800)
| | | | +- GuardManager: source=G['__import_transformers_dot_modeling_utils'].torch.jit, accessed_by=GetAttrGuardAccessor(jit)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_modeling_utils'].torch.jit, 139842414949968)
| | | | | +- GuardManager: source=G['__import_transformers_dot_modeling_utils'].torch.jit.is_tracing, accessed_by=GetAttrGuardAccessor(is_tracing)
| | | | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_modeling_utils'].torch.jit.is_tracing, 139842413687088)
| | | +- GuardManager: source=G['__import_transformers_dot_modeling_utils'].is_torch_fx_proxy, accessed_by=GetAttrGuardAccessor(is_torch_fx_proxy)
| | | | +- GuardManager: source=G['__import_transformers_dot_modeling_utils'].is_torch_fx_proxy.__code__, accessed_by=GetAttrGuardAccessor(__code__)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_modeling_utils'].is_torch_fx_proxy.__code__, 139839683265264)
| | | +- GuardManager: source=G['__import_transformers_dot_modeling_utils'].is_torchdynamo_compiling, accessed_by=GetAttrGuardAccessor(is_torchdynamo_compiling)
| | | | +- GuardManager: source=G['__import_transformers_dot_modeling_utils'].is_torchdynamo_compiling.__code__, accessed_by=GetAttrGuardAccessor(__code__)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_modeling_utils'].is_torchdynamo_compiling.__code__, 139839683236192)
| | +- GuardManager: source=G['__import_transformers_dot_utils_dot_import_utils'], accessed_by=DictGetItemGuardAccessor(__import_transformers_dot_utils_dot_import_utils)
| | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_utils_dot_import_utils'], 139839683217824)
| | | +- GuardManager: source=G['__import_transformers_dot_utils_dot_import_utils']._torch_available, accessed_by=GetAttrGuardAccessor(_torch_available)
| | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_utils_dot_import_utils']._torch_available, 7685856)
| | | +- GuardManager: source=G['__import_transformers_dot_utils_dot_import_utils'].is_torch_available, accessed_by=GetAttrGuardAccessor(is_torch_available)
| | | | +- GuardManager: source=G['__import_transformers_dot_utils_dot_import_utils'].is_torch_available.__code__, accessed_by=GetAttrGuardAccessor(__code__)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_utils_dot_import_utils'].is_torch_available.__code__, 139839683197424)
| | | +- GuardManager: source=G['__import_transformers_dot_utils_dot_import_utils']._torch_fx_available, accessed_by=GetAttrGuardAccessor(_torch_fx_available)
| | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_utils_dot_import_utils']._torch_fx_available, 7685856)
| | | +- GuardManager: source=G['__import_transformers_dot_utils_dot_import_utils'].is_torch_fx_available, accessed_by=GetAttrGuardAccessor(is_torch_fx_available)
| | | | +- GuardManager: source=G['__import_transformers_dot_utils_dot_import_utils'].is_torch_fx_available.__code__, accessed_by=GetAttrGuardAccessor(__code__)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_utils_dot_import_utils'].is_torch_fx_available.__code__, 139839683233376)
V0627 17:31:02.815000 139845268738432 torch/_dynamo/] {"compilation_metrics": {"compile_id": "2/0", "frame_key": "7", "co_name": "forward", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/", "co_firstlineno": 1970, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 39, "shape_env_guard_count": 0, "graph_op_count": 3, "graph_node_count": 5, "graph_input_count": 0, "start_time": 1719534661.536575, "entire_frame_compile_time_s": 1.2790420055389404, "backend_compile_time_s": 1.2300312519073486, "inductor_compile_time_s": 1.2066993713378906, "code_gen_time_s": 1.083174467086792, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Logger not supported for non-export cases"], "dynamo_time_before_restart_s": 0.014907121658325195, "has_guarded_code": true}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:02.816000 139845268738432 torch/_dynamo/] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 3, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.823000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 0, "describer_id": 12, "size": 6552}, "frame_id": 3, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.823000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 819], "is_leaf": true, "stride": [819, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2ef4724e50>", "describer_id": 12}, "frame_id": 3, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.823000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 12, "id": 0, "source": "L['input_ids']"}, "frame_id": 3, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.830000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 0, "describer_id": 13, "size": 6552}, "frame_id": 3, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:02.830000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 819], "is_leaf": true, "stride": [819, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2ef4724e50>", "describer_id": 13}, "frame_id": 3, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:02.830000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 13, "id": 0, "source": "L['input_ids']"}, "frame_id": 3, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:02.837000 139845268738432 torch/_dynamo/] {"dynamo_guards": {}, "frame_id": 3, "frame_compile_id": 0, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
V0627 17:31:02.837000 139845268738432 torch/_dynamo/] {"dynamo_cpp_guards_str": {}, "frame_id": 3, "frame_compile_id": 0, "attempt": 1, "has_payload": "fce4fac5f9230c475246dd6dd52e1c05"}
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/ in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839714901584)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| | | +- GuardManager: source=L['self'].config, accessed_by=DictGetItemGuardAccessor(config)
| | | | +- TYPE_MATCH: ___check_type_id(L['self'].config, 139839810624528)
| +- GuardManager: source=L['input_ids'], accessed_by=DictGetItemGuardAccessor(input_ids)
| | +- TENSOR_MATCH: check_tensor(L['input_ids'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int64, device=None, requires_grad=False, size=[1, 819], stride=[819, 1])
| | +- NO_HASATTR: hasattr(L['input_ids'], '_dynamo_dynamic_indices') == False
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['logger'], accessed_by=DictGetItemGuardAccessor(logger)
| | | +- ID_MATCH: ___check_obj_id(G['logger'], 139839664782448)
V0627 17:31:02.837000 139845268738432 torch/_dynamo/] {"compilation_metrics": {"compile_id": "3/0", "frame_key": "8", "co_name": "_pad_to_block_size", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/", "co_firstlineno": 2208, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 9, "shape_env_guard_count": 0, "graph_op_count": 0, "graph_node_count": 1, "graph_input_count": 1, "start_time": 1719534662.816827, "entire_frame_compile_time_s": 0.0205228328704834, "backend_compile_time_s": null, "inductor_compile_time_s": null, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Logger not supported for non-export cases"], "dynamo_time_before_restart_s": 0.007956266403198242, "has_guarded_code": true}, "frame_id": 3, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:02.838000 139845268738432 torch/_dynamo/] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2226, "name": "_pad_to_block_size", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.839000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 0, "describer_id": 14, "size": 6552}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.839000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 819], "is_leaf": true, "stride": [819, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2ef4724e50>", "describer_id": 14}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.839000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 14, "id": 0, "source": "L['input_ids']"}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.843000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 1, "describer_id": 14, "size": 3276}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.844000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 2, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 819], "is_leaf": true, "stride": [819, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31f99710>", "describer_id": 14}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.844000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 14, "id": 2, "source": "L['attention_mask']"}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.846000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 2, "describer_id": 14, "size": 32768}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.846000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 5, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 4096], "is_leaf": true, "stride": [4096, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2eb1d44450>", "describer_id": 14}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.846000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 4, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 819], "is_leaf": true, "is_view": true, "stride": [4096, 1], "storage": 2, "base": 5, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2ed9f15fd0>", "describer_id": 14}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.846000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 14, "id": 4, "source": "L['token_type_ids']"}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.850000 139845268738432 torch/_dynamo/] {"dynamo_output_graph": {"sizes": {"l_input_ids_": [1, 819], "l_attention_mask_": [1, 819], "l_token_type_ids_": [1, 819], "input_ids": [1, 832], "attention_mask": [1, 832], "token_type_ids": [1, 832]}}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "e7d86ff372082e962b35557ebd7308fc"}
class GraphModule(torch.nn.Module):
def forward(self, L_input_ids_: "i64[1, 819][819, 1]cpu", L_attention_mask_: "f32[1, 819][819, 1]cpu", L_token_type_ids_: "i64[1, 819][4096, 1]cpu"):
l_input_ids_ = L_input_ids_
l_attention_mask_ = L_attention_mask_
l_token_type_ids_ = L_token_type_ids_
# File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/ in pad, code: return torch._C._nn.pad(input, pad, mode, value)
input_ids: "i64[1, 832][832, 1]cpu" = torch._C._nn.pad(l_input_ids_, (0, 13), 'constant', 0); l_input_ids_ = None
# File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/ in pad, code: return torch._C._nn.pad(input, pad, mode, value)
attention_mask: "f32[1, 832][832, 1]cpu" = torch._C._nn.pad(l_attention_mask_, (0, 13), 'constant', False); l_attention_mask_ = None
# File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/ in pad, code: return torch._C._nn.pad(input, pad, mode, value)
token_type_ids: "i64[1, 832][832, 1]cpu" = torch._C._nn.pad(l_token_type_ids_, (0, 13), 'constant', 0); l_token_type_ids_ = None
return (input_ids, attention_mask, token_type_ids)
V0627 17:31:02.865000 139845268738432 torch/_functorch/_aot_autograd/] {"aot_forward_graph": {}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "c1bca16543cf877223d6615932016518"}
class <lambda>(torch.nn.Module):
def forward(self, arg0_1: "i64[1, 819][819, 1]cpu", arg1_1: "f32[1, 819][819, 1]cpu", arg2_1: "i64[1, 819][4096, 1]cpu"):
# File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/ in pad, code: return torch._C._nn.pad(input, pad, mode, value)
constant_pad_nd: "i64[1, 832][832, 1]cpu" = torch.ops.aten.constant_pad_nd.default(arg0_1, [0, 13], 0.0); arg0_1 = None
# File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/ in pad, code: return torch._C._nn.pad(input, pad, mode, value)
constant_pad_nd_1: "f32[1, 832][832, 1]cpu" = torch.ops.aten.constant_pad_nd.default(arg1_1, [0, 13], 0.0); arg1_1 = None
# File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/ in pad, code: return torch._C._nn.pad(input, pad, mode, value)
constant_pad_nd_2: "i64[1, 832][832, 1]cpu" = torch.ops.aten.constant_pad_nd.default(arg2_1, [0, 13], 0.0); arg2_1 = None
return (constant_pad_nd, constant_pad_nd_1, constant_pad_nd_2)
V0627 17:31:02.875000 139845268738432 torch/_inductor/] {"inductor_post_grad_graph": {}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "c1bca16543cf877223d6615932016518"}
class <lambda>(torch.nn.Module):
def forward(self, arg0_1: "i64[1, 819][819, 1]cpu", arg1_1: "f32[1, 819][819, 1]cpu", arg2_1: "i64[1, 819][4096, 1]cpu"):
# File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/ in pad, code: return torch._C._nn.pad(input, pad, mode, value)
constant_pad_nd: "i64[1, 832][832, 1]cpu" = torch.ops.aten.constant_pad_nd.default(arg0_1, [0, 13], 0.0); arg0_1 = None
# File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/ in pad, code: return torch._C._nn.pad(input, pad, mode, value)
constant_pad_nd_1: "f32[1, 832][832, 1]cpu" = torch.ops.aten.constant_pad_nd.default(arg1_1, [0, 13], 0.0); arg1_1 = None
# File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/ in pad, code: return torch._C._nn.pad(input, pad, mode, value)
constant_pad_nd_2: "i64[1, 832][832, 1]cpu" = torch.ops.aten.constant_pad_nd.default(arg2_1, [0, 13], 0.0); arg2_1 = None
return (constant_pad_nd, constant_pad_nd_1, constant_pad_nd_2)
V0627 17:31:02.904000 139845268738432 torch/_inductor/] {"inductor_output_code": {"filename": "/tmp/torchinductor_leslie/fj/"}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "3758e3875a0e606fcec57aeffa852874"}
# AOT ID: ['2_inference']
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
from torch._inductor.codegen.memory_planning import _align as align
from torch import device, empty_strided
from torch._inductor.async_compile import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
from torch._inductor.codegen.multi_kernel import MultiKernelCall
aten = torch.ops.aten
inductor_ops = torch.ops.inductor
_quantized = torch.ops._quantized
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
alloc_from_pool = torch.ops.inductor._alloc_from_pool
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
async_compile = AsyncCompile()
cpp_fused_constant_pad_nd_0 = async_compile.cpp_pybinding(['const int64_t*', 'const float*', 'const int64_t*', 'int64_t*', 'float*', 'int64_t*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const int64_t* in_ptr0,
const float* in_ptr1,
const int64_t* in_ptr2,
int64_t* out_ptr0,
float* out_ptr1,
int64_t* out_ptr2)
for(long x0=static_cast<long>(0L); x0<static_cast<long>(832L); x0+=static_cast<long>(16L))
auto tmp0 = x0;
auto tmp1 = c10::convert<int32_t>(tmp0);
auto tmp2 = at::vec::Vectorized<int32_t>::arange(tmp1, 1);
auto tmp3 = static_cast<int32_t>(819);
auto tmp4 = at::vec::Vectorized<int32_t>(tmp3);
auto tmp5 = at::vec::VecMask<int32_t,1>(tmp2 < tmp4);
auto tmp6 = [&]
auto tmp7 = tmp5.template cast<float,1>().template loadu<int64_t,2>(in_ptr0 + static_cast<long>(x0));
return tmp7;
auto tmp10 =
if (tmp5.all_zero())
return at::vec::VectorizedN<int64_t,2>(static_cast<int64_t>(0));
auto tmp8 = tmp6();
auto tmp9 = at::vec::VectorizedN<int64_t,2>(static_cast<int64_t>(0));
return decltype(tmp8)::blendv(tmp9, tmp8, tmp5.template cast<int64_t,2>());
; + static_cast<long>(x0), 16);
for(long x0=static_cast<long>(0L); x0<static_cast<long>(832L); x0+=static_cast<long>(16L))
auto tmp0 = x0;
auto tmp1 = c10::convert<int32_t>(tmp0);
auto tmp2 = at::vec::Vectorized<int32_t>::arange(tmp1, 1);
auto tmp3 = static_cast<int32_t>(819);
auto tmp4 = at::vec::Vectorized<int32_t>(tmp3);
auto tmp5 = at::vec::VecMask<int32_t,1>(tmp2 < tmp4);
auto tmp6 = [&]
auto tmp7 = tmp5.template cast<float,1>().template loadu<float,1>(in_ptr1 + static_cast<long>(x0));
return tmp7;
auto tmp10 =
if (tmp5.all_zero())
return at::vec::Vectorized<float>(static_cast<float>(0.0));
auto tmp8 = tmp6();
auto tmp9 = at::vec::Vectorized<float>(static_cast<float>(0.0));
return decltype(tmp8)::blendv(tmp9, tmp8, tmp5.template cast<float,1>());
; + static_cast<long>(x0));
for(long x0=static_cast<long>(0L); x0<static_cast<long>(832L); x0+=static_cast<long>(16L))
auto tmp0 = x0;
auto tmp1 = c10::convert<int32_t>(tmp0);
auto tmp2 = at::vec::Vectorized<int32_t>::arange(tmp1, 1);
auto tmp3 = static_cast<int32_t>(819);
auto tmp4 = at::vec::Vectorized<int32_t>(tmp3);
auto tmp5 = at::vec::VecMask<int32_t,1>(tmp2 < tmp4);
auto tmp6 = [&]
auto tmp7 = tmp5.template cast<float,1>().template loadu<int64_t,2>(in_ptr2 + static_cast<long>(x0));
return tmp7;
auto tmp10 =
if (tmp5.all_zero())
return at::vec::VectorizedN<int64_t,2>(static_cast<int64_t>(0));
auto tmp8 = tmp6();
auto tmp9 = at::vec::VectorizedN<int64_t,2>(static_cast<int64_t>(0));
return decltype(tmp8)::blendv(tmp9, tmp8, tmp5.template cast<int64_t,2>());
; + static_cast<long>(x0), 16);
del async_compile
def call(args):
arg0_1, arg1_1, arg2_1 = args
assert_size_stride(arg0_1, (1, 819), (819, 1))
assert_size_stride(arg1_1, (1, 819), (819, 1))
assert_size_stride(arg2_1, (1, 819), (4096, 1))
buf0 = empty_strided_cpu((1, 832), (832, 1), torch.int64)
buf1 = empty_strided_cpu((1, 832), (832, 1), torch.float32)
buf2 = empty_strided_cpu((1, 832), (832, 1), torch.int64)
cpp_fused_constant_pad_nd_0(arg0_1, arg1_1, arg2_1, buf0, buf1, buf2)
del arg0_1
del arg1_1
del arg2_1
return (buf0, buf1, buf2, )
def benchmark_compiled_module(times=10, repeat=10):
from torch._dynamo.testing import rand_strided
from torch._inductor.utils import print_performance
arg0_1 = rand_strided((1, 819), (819, 1), device='cpu', dtype=torch.int64)
arg1_1 = rand_strided((1, 819), (819, 1), device='cpu', dtype=torch.float32)
arg2_1 = rand_strided((1, 819), (4096, 1), device='cpu', dtype=torch.int64)
fn = lambda: call([arg0_1, arg1_1, arg2_1])
return print_performance(fn, times=times, repeat=repeat)
if __name__ == "__main__":
from torch._inductor.wrapper_benchmark import compiled_module_main
compiled_module_main('hf_BigBird', benchmark_compiled_module)
V0627 17:31:02.910000 139845268738432 torch/_dynamo/] {"dynamo_guards": {}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
V0627 17:31:02.911000 139845268738432 torch/_dynamo/] {"dynamo_cpp_guards_str": {}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "ae71e2a61c1f7e9b1434b71d14d096e3"}
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/ in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['input_ids'], accessed_by=DictGetItemGuardAccessor(input_ids)
| | +- TENSOR_MATCH: check_tensor(L['input_ids'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int64, device=None, requires_grad=False, size=[1, 819], stride=[819, 1])
| | +- NO_HASATTR: hasattr(L['input_ids'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['input_ids'], L['attention_mask'], L['token_type_ids'])
| +- GuardManager: source=L['padding_len'], accessed_by=DictGetItemGuardAccessor(padding_len)
| | +- EQUALS_MATCH: L['padding_len'] == 13
| +- GuardManager: source=L['pad_token_id'], accessed_by=DictGetItemGuardAccessor(pad_token_id)
| | +- EQUALS_MATCH: L['pad_token_id'] == 0
| +- GuardManager: source=L['position_ids'], accessed_by=DictGetItemGuardAccessor(position_ids)
| | +- ID_MATCH: ___check_obj_id(L['position_ids'], 7636800)
| +- GuardManager: source=L['inputs_embeds'], accessed_by=DictGetItemGuardAccessor(inputs_embeds)
| | +- ID_MATCH: ___check_obj_id(L['inputs_embeds'], 7636800)
| +- GuardManager: source=L['attention_mask'], accessed_by=DictGetItemGuardAccessor(attention_mask)
| | +- TENSOR_MATCH: check_tensor(L['attention_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 819], stride=[819, 1])
| | +- NO_HASATTR: hasattr(L['attention_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['input_ids'], L['attention_mask'], L['token_type_ids'])
| +- GuardManager: source=L['token_type_ids'], accessed_by=DictGetItemGuardAccessor(token_type_ids)
| | +- TENSOR_MATCH: check_tensor(L['token_type_ids'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int64, device=None, requires_grad=False, size=[1, 819], stride=[4096, 1])
| | +- NO_HASATTR: hasattr(L['token_type_ids'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['input_ids'], L['attention_mask'], L['token_type_ids'])
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['nn'], accessed_by=DictGetItemGuardAccessor(nn)
| | | +- ID_MATCH: ___check_obj_id(G['nn'], 139842442593680)
| | | +- GuardManager: source=G['nn'].functional, accessed_by=GetAttrGuardAccessor(functional)
| | | | +- ID_MATCH: ___check_obj_id(G['nn'].functional, 139842441627024)
| | | | +- GuardManager: source=G['nn'].functional.pad, accessed_by=GetAttrGuardAccessor(pad)
| | | | | +- GuardManager: source=G['nn'].functional.pad.__code__, accessed_by=GetAttrGuardAccessor(__code__)
| | | | | | +- ID_MATCH: ___check_obj_id(G['nn'].functional.pad.__code__, 139842439629440)
| | | | | +- GuardManager: source=G['nn'].functional.pad, accessed_by=FuncDefaultsGuardAccessor
| | | | | | +- GuardManager: source=G['nn'].functional.pad.__defaults__[0], accessed_by=GetItemGuardAccessor(0)
| | | | | | | +- EQUALS_MATCH: G['nn'].functional.pad.__defaults__[0] == 'constant'
| | +- GuardManager: source=G['__import_torch_dot_nn_dot_functional'], accessed_by=DictGetItemGuardAccessor(__import_torch_dot_nn_dot_functional)
| | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_functional'], 139842441627024)
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_functional'].torch, accessed_by=GetAttrGuardAccessor(torch)
| | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_functional'].torch, 139845236322800)
| | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_functional'].torch._C, accessed_by=GetAttrGuardAccessor(_C)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_functional'].torch._C, 139845228547104)
| | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_functional'].torch._C._nn, accessed_by=GetAttrGuardAccessor(_nn)
| | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_functional'].torch._C._nn, 139842445377216)
| | | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_functional'].torch._C._nn.pad, accessed_by=GetAttrGuardAccessor(pad)
| | | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_functional'].torch._C._nn.pad, 139842445416928)
| | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_functional'].torch.jit, accessed_by=GetAttrGuardAccessor(jit)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_functional'].torch.jit, 139842414949968)
| | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_functional'].torch.jit.is_scripting, accessed_by=GetAttrGuardAccessor(is_scripting)
| | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_functional'].torch.jit.is_scripting, 139842422983696)
| | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_functional'].torch.are_deterministic_algorithms_enabled, accessed_by=GetAttrGuardAccessor(are_deterministic_algorithms_enabled)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_functional'].torch.are_deterministic_algorithms_enabled, 139842451619504)
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_functional'].has_torch_function_unary, accessed_by=GetAttrGuardAccessor(has_torch_function_unary)
| | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_functional'].has_torch_function_unary, 139845228559104)
V0627 17:31:02.911000 139845268738432 torch/_dynamo/] {"compilation_metrics": {"compile_id": "4/0", "frame_key": "9", "co_name": "torch_dynamo_resume_in__pad_to_block_size_at_2226", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/", "co_firstlineno": 2226, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 26, "shape_env_guard_count": 0, "graph_op_count": 3, "graph_node_count": 7, "graph_input_count": 3, "start_time": 1719534662.838091, "entire_frame_compile_time_s": 0.07323813438415527, "backend_compile_time_s": 0.05719876289367676, "inductor_compile_time_s": 0.03380870819091797, "code_gen_time_s": 0.027545690536499023, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": [], "dynamo_time_before_restart_s": 0.0, "has_guarded_code": true}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.912000 139845268738432 torch/_dynamo/] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.914000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 0, "describer_id": 16, "size": 6656}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.914000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31b97f10>", "describer_id": 16}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.914000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 16, "id": 0, "source": "L['___stack0'][1]"}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.915000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 1, "describer_id": 16, "size": 3328}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.915000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 16}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.915000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 16, "id": 1, "source": "L['___stack0'][2]"}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.915000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 2, "describer_id": 16, "size": 6656}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.916000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 2, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2ea81b39c0>", "describer_id": 16}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.916000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 16, "id": 2, "source": "L['___stack0'][3]"}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.936000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 3, "describer_id": 16, "size": 32768}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.936000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 16, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 4096], "is_leaf": true, "stride": [4096, 1], "storage": 3, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2eb1d44270>", "describer_id": 16}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.936000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 16, "id": 16, "source": "L['self'].embeddings.position_ids"}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.997000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 0, "describer_id": 17, "size": 6656}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:02.998000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31b97f10>", "describer_id": 17}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:02.998000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 17, "id": 0, "source": "L['___stack0'][1]"}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:02.998000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 1, "describer_id": 17, "size": 3328}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:02.998000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 17}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:02.998000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 17, "id": 1, "source": "L['___stack0'][2]"}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:02.999000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 2, "describer_id": 17, "size": 6656}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:02.999000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 2, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2ea81b39c0>", "describer_id": 17}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:02.999000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 17, "id": 2, "source": "L['___stack0'][3]"}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.014000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 3, "describer_id": 17, "size": 32768}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.014000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 4096], "is_leaf": true, "stride": [4096, 1], "storage": 3, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2eb1d44270>", "describer_id": 17}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.014000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 17, "id": 3, "source": "L['self'].embeddings.position_ids"}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.027000 139845268738432 torch/_dynamo/] {"dynamo_output_graph": {"sizes": {"l_stack0_1_": [1, 832], "l_stack0_2_": [1, 832], "l_stack0_3_": [1, 832], "blocked_encoder_mask": [1, 13, 64], "getitem": [1, 9, 64], "getitem_1": [1, 9, 64], "getitem_2": [1, 9, 64], "exp_blocked_to_pad": [1, 9, 192], "getitem_3": [1, 9, 64], "band_mask": [1, 1, 9, 64, 192], "unsqueeze_": [1, 1, 9, 64, 192], "from_mask": [1, 1, 832, 1], "to_mask": [1, 1, 1, 832], "l__self___embeddings_position_ids": [1, 4096], "position_ids": [1, 832], "inputs_embeds": [1, 832, 768], "token_type_embeddings": [1, 832, 768], "embeddings": [1, 832, 768], "position_embeddings": [1, 832, 768], "embeddings_1": [1, 832, 768], "embeddings_2": [1, 832, 768], "embeddings_3": [1, 832, 768]}}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1, "has_payload": "5bf8fff16cea4127a0a6b6a6800ef31a"}
class GraphModule(torch.nn.Module):
def forward(self, L_stack0_1_: "i64[1, 832][832, 1]cpu", L_stack0_2_: "f32[1, 832][832, 1]cpu", L_stack0_3_: "i64[1, 832][832, 1]cpu"):
l_stack0_1_ = L_stack0_1_
l_stack0_2_ = L_stack0_2_
l_stack0_3_ = L_stack0_3_
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in create_masks_for_block_sparse_attn, code: blocked_encoder_mask = attention_mask.view(batch_size, seq_length // block_size, block_size)
blocked_encoder_mask: "f32[1, 13, 64][832, 64, 1]cpu" = l_stack0_2_.view(1, 13, 64)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in create_band_mask_from_inputs, code: [to_blocked_mask[:, 1:-3], to_blocked_mask[:, 2:-2], to_blocked_mask[:, 3:-1]], dim=2
getitem: "f32[1, 9, 64][832, 64, 1]cpu" = blocked_encoder_mask[(slice(None, None, None), slice(1, -3, None))]
getitem_1: "f32[1, 9, 64][832, 64, 1]cpu" = blocked_encoder_mask[(slice(None, None, None), slice(2, -2, None))]
getitem_2: "f32[1, 9, 64][832, 64, 1]cpu" = blocked_encoder_mask[(slice(None, None, None), slice(3, -1, None))]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in create_band_mask_from_inputs, code: exp_blocked_to_pad =
exp_blocked_to_pad: "f32[1, 9, 192][1728, 192, 1]cpu" =[getitem, getitem_1, getitem_2], dim = 2); getitem = getitem_1 = getitem_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in create_band_mask_from_inputs, code: band_mask = torch.einsum("blq,blk->blqk", from_blocked_mask[:, 2:-2], exp_blocked_to_pad)
getitem_3: "f32[1, 9, 64][832, 64, 1]cpu" = blocked_encoder_mask[(slice(None, None, None), slice(2, -2, None))]
band_mask: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = torch.functional.einsum('blq,blk->blqk', getitem_3, exp_blocked_to_pad); getitem_3 = exp_blocked_to_pad = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in create_band_mask_from_inputs, code: band_mask.unsqueeze_(1)
unsqueeze_: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = band_mask.unsqueeze_(1)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in create_masks_for_block_sparse_attn, code: from_mask = attention_mask.view(batch_size, 1, seq_length, 1)
from_mask: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu" = l_stack0_2_.view(1, 1, 832, 1)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in create_masks_for_block_sparse_attn, code: to_mask = attention_mask.view(batch_size, 1, 1, seq_length)
to_mask: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = l_stack0_2_.view(1, 1, 1, 832); l_stack0_2_ = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
l__self___embeddings_position_ids: "i64[1, 4096][4096, 1]cpu" = self.L__self___embeddings_position_ids
position_ids: "i64[1, 832][4096, 1]cpu" = l__self___embeddings_position_ids[(slice(None, None, None), slice(0, 832, None))]; l__self___embeddings_position_ids = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: inputs_embeds = self.word_embeddings(input_ids)
inputs_embeds: "f32[1, 832, 768][638976, 768, 1]cpu" = self.L__self___embeddings_word_embeddings(l_stack0_1_); l_stack0_1_ = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: token_type_embeddings = self.token_type_embeddings(token_type_ids)
token_type_embeddings: "f32[1, 832, 768][638976, 768, 1]cpu" = self.L__self___embeddings_token_type_embeddings(l_stack0_3_); l_stack0_3_ = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: embeddings = inputs_embeds + token_type_embeddings
embeddings: "f32[1, 832, 768][638976, 768, 1]cpu" = inputs_embeds + token_type_embeddings; inputs_embeds = token_type_embeddings = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: position_embeddings = self.position_embeddings(position_ids)
position_embeddings: "f32[1, 832, 768][638976, 768, 1]cpu" = self.L__self___embeddings_position_embeddings(position_ids); position_ids = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: embeddings += position_embeddings
embeddings += position_embeddings; embeddings_1: "f32[1, 832, 768][638976, 768, 1]cpu" = embeddings; embeddings = position_embeddings = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: embeddings = self.dropout(embeddings)
embeddings_2: "f32[1, 832, 768][638976, 768, 1]cpu" = self.L__self___embeddings_dropout(embeddings_1); embeddings_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: embeddings = self.LayerNorm(embeddings)
embeddings_3: "f32[1, 832, 768][638976, 768, 1]cpu" = self.L__self___embeddings_LayerNorm(embeddings_2); embeddings_2 = None
return (embeddings_3, band_mask, from_mask, to_mask, blocked_encoder_mask)
V0627 17:31:03.091000 139845268738432 torch/_functorch/_aot_autograd/] {"aot_forward_graph": {}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1, "has_payload": "f7b6ff7875cdbc7ff1ea7b5f6bc39ed2"}
class <lambda>(torch.nn.Module):
def forward(self, arg0_1: "f32[50358, 768][768, 1]cpu", arg1_1: "f32[2, 768][768, 1]cpu", arg2_1: "f32[4096, 768][768, 1]cpu", arg3_1: "f32[768][1]cpu", arg4_1: "f32[768][1]cpu", arg5_1: "i64[1, 4096][4096, 1]cpu", arg6_1: "i64[1, 832][832, 1]cpu", arg7_1: "f32[1, 832][832, 1]cpu", arg8_1: "i64[1, 832][832, 1]cpu"):
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in create_masks_for_block_sparse_attn, code: blocked_encoder_mask = attention_mask.view(batch_size, seq_length // block_size, block_size)
view: "f32[1, 13, 64][832, 64, 1]cpu" = torch.ops.aten.view.default(arg7_1, [1, 13, 64])
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in create_band_mask_from_inputs, code: [to_blocked_mask[:, 1:-3], to_blocked_mask[:, 2:-2], to_blocked_mask[:, 3:-1]], dim=2
slice_1: "f32[1, 13, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view, 0, 0, 9223372036854775807)
slice_2: "f32[1, 9, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_1, 1, 1, -3); slice_1 = None
slice_3: "f32[1, 13, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view, 0, 0, 9223372036854775807)
slice_4: "f32[1, 9, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_3, 1, 2, -2); slice_3 = None
slice_5: "f32[1, 13, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view, 0, 0, 9223372036854775807)
slice_6: "f32[1, 9, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_5, 1, 3, -1); slice_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in create_band_mask_from_inputs, code: exp_blocked_to_pad =
cat: "f32[1, 9, 192][1728, 192, 1]cpu" =[slice_2, slice_4, slice_6], 2); slice_2 = slice_4 = slice_6 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in create_band_mask_from_inputs, code: band_mask = torch.einsum("blq,blk->blqk", from_blocked_mask[:, 2:-2], exp_blocked_to_pad)
slice_7: "f32[1, 13, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view, 0, 0, 9223372036854775807)
slice_8: "f32[1, 9, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_7, 1, 2, -2); slice_7 = None
unsqueeze: "f32[1, 9, 64, 1][832, 64, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_8, 3); slice_8 = None
permute: "f32[1, 9, 64, 1][832, 64, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze, [0, 1, 2, 3]); unsqueeze = None
unsqueeze_1: "f32[1, 9, 192, 1][1728, 192, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(cat, 3); cat = None
permute_1: "f32[1, 9, 1, 192][1728, 192, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_1, [0, 1, 3, 2]); unsqueeze_1 = None
mul: "f32[1, 9, 64, 192][110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(permute, permute_1); permute = permute_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in create_band_mask_from_inputs, code: band_mask.unsqueeze_(1)
unsqueeze_2: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = torch.ops.aten.unsqueeze.default(mul, 1); mul = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in create_masks_for_block_sparse_attn, code: from_mask = attention_mask.view(batch_size, 1, seq_length, 1)
view_1: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu" = torch.ops.aten.view.default(arg7_1, [1, 1, 832, 1])
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in create_masks_for_block_sparse_attn, code: to_mask = attention_mask.view(batch_size, 1, 1, seq_length)
view_2: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.view.default(arg7_1, [1, 1, 1, 832]); arg7_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
slice_9: "i64[1, 4096][4096, 1]cpu" = torch.ops.aten.slice.Tensor(arg5_1, 0, 0, 9223372036854775807); arg5_1 = None
slice_10: "i64[1, 832][4096, 1]cpu" = torch.ops.aten.slice.Tensor(slice_9, 1, 0, 832); slice_9 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: inputs_embeds = self.word_embeddings(input_ids)
embedding: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.embedding.default(arg0_1, arg6_1, 0); arg0_1 = arg6_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: token_type_embeddings = self.token_type_embeddings(token_type_ids)
embedding_1: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.embedding.default(arg1_1, arg8_1); arg1_1 = arg8_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: embeddings = inputs_embeds + token_type_embeddings
add: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(embedding, embedding_1); embedding = embedding_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: position_embeddings = self.position_embeddings(position_ids)
embedding_2: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.embedding.default(arg2_1, slice_10); arg2_1 = slice_10 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: embeddings += position_embeddings
add_1: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(add, embedding_2); add = embedding_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: embeddings = self.dropout(embeddings)
clone: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.clone.default(add_1); add_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: embeddings = self.LayerNorm(embeddings)
var_mean = torch.ops.aten.var_mean.correction(clone, [2], correction = 0, keepdim = True)
getitem: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[0]
getitem_1: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[1]; var_mean = None
add_2: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.add.Tensor(getitem, 1e-12); getitem = None
rsqrt: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.rsqrt.default(add_2); add_2 = None
sub: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.sub.Tensor(clone, getitem_1); clone = getitem_1 = None
mul_1: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(sub, rsqrt); sub = rsqrt = None
mul_2: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(mul_1, arg3_1); mul_1 = arg3_1 = None
add_3: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(mul_2, arg4_1); mul_2 = arg4_1 = None
return (add_3, unsqueeze_2, view_1, view_2, view)
V0627 17:31:03.161000 139845268738432 torch/_inductor/] {"inductor_post_grad_graph": {}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1, "has_payload": "06e74f82fcfa9d791dc26355727799db"}
class <lambda>(torch.nn.Module):
def forward(self, arg6_1: "i64[1, 832][832, 1]cpu", arg7_1: "f32[1, 832][832, 1]cpu", arg8_1: "i64[1, 832][832, 1]cpu"):
# No stacktrace found for following nodes
_frozen_param0: "f32[50358, 768][768, 1]cpu" = self._frozen_param0
_frozen_param1: "f32[2, 768][768, 1]cpu" = self._frozen_param1
_frozen_param3: "f32[768][1]cpu" = self._frozen_param3
_frozen_param4: "f32[768][1]cpu" = self._frozen_param4
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: position_embeddings = self.position_embeddings(position_ids)
_frozen_param6: "f32[1, 832, 768][638976, 768, 1]cpu" = self._frozen_param6
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: inputs_embeds = self.word_embeddings(input_ids)
embedding: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.embedding.default(_frozen_param0, arg6_1, 0); _frozen_param0 = arg6_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: token_type_embeddings = self.token_type_embeddings(token_type_ids)
embedding_1: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.embedding.default(_frozen_param1, arg8_1); _frozen_param1 = arg8_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: embeddings = inputs_embeds + token_type_embeddings
add: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(embedding, embedding_1); embedding = embedding_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: embeddings += position_embeddings
add_1: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(add, _frozen_param6); add = _frozen_param6 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: embeddings = self.LayerNorm(embeddings)
var_mean = torch.ops.aten.var_mean.correction(add_1, [2], correction = 0, keepdim = True)
getitem: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[0]
getitem_1: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[1]; var_mean = None
sub: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.sub.Tensor(add_1, getitem_1); add_1 = getitem_1 = None
add_2: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.add.Tensor(getitem, 1e-12); getitem = None
rsqrt: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.rsqrt.default(add_2); add_2 = None
mul_1: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(sub, rsqrt); sub = rsqrt = None
mul_2: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(mul_1, _frozen_param3); mul_1 = _frozen_param3 = None
add_3: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(mul_2, _frozen_param4); mul_2 = _frozen_param4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in create_masks_for_block_sparse_attn, code: blocked_encoder_mask = attention_mask.view(batch_size, seq_length // block_size, block_size)
view: "f32[1, 13, 64][832, 64, 1]cpu" = torch.ops.aten.reshape.default(arg7_1, [1, 13, 64])
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in create_band_mask_from_inputs, code: [to_blocked_mask[:, 1:-3], to_blocked_mask[:, 2:-2], to_blocked_mask[:, 3:-1]], dim=2
slice_4: "f32[1, 9, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view, 1, 2, -2)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in create_band_mask_from_inputs, code: band_mask = torch.einsum("blq,blk->blqk", from_blocked_mask[:, 2:-2], exp_blocked_to_pad)
unsqueeze: "f32[1, 9, 64, 1][832, 64, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_4, 3)
permute: "f32[1, 9, 64, 1][832, 64, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze, [0, 1, 2, 3]); unsqueeze = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in create_band_mask_from_inputs, code: [to_blocked_mask[:, 1:-3], to_blocked_mask[:, 2:-2], to_blocked_mask[:, 3:-1]], dim=2
slice_2: "f32[1, 9, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view, 1, 1, -3)
slice_6: "f32[1, 9, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view, 1, 3, -1)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in create_band_mask_from_inputs, code: exp_blocked_to_pad =
cat: "f32[1, 9, 192][1728, 192, 1]cpu" =[slice_2, slice_4, slice_6], 2); slice_2 = slice_4 = slice_6 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in create_band_mask_from_inputs, code: band_mask = torch.einsum("blq,blk->blqk", from_blocked_mask[:, 2:-2], exp_blocked_to_pad)
unsqueeze_1: "f32[1, 9, 192, 1][1728, 192, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(cat, 3); cat = None
permute_1: "f32[1, 9, 1, 192][1728, 192, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_1, [0, 1, 3, 2]); unsqueeze_1 = None
mul: "f32[1, 9, 64, 192][110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(permute, permute_1); permute = permute_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in create_band_mask_from_inputs, code: band_mask.unsqueeze_(1)
unsqueeze_2: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = torch.ops.aten.unsqueeze.default(mul, 1); mul = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in create_masks_for_block_sparse_attn, code: from_mask = attention_mask.view(batch_size, 1, seq_length, 1)
view_1: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu" = torch.ops.aten.reshape.default(arg7_1, [1, 1, 832, 1])
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in create_masks_for_block_sparse_attn, code: to_mask = attention_mask.view(batch_size, 1, 1, seq_length)
view_2: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.reshape.default(arg7_1, [1, 1, 1, 832]); arg7_1 = None
return (add_3, unsqueeze_2, view_1, view_2, view)
V0627 17:31:03.519000 139845268738432 torch/_inductor/] {"inductor_output_code": {"filename": "/tmp/torchinductor_leslie/de/"}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1, "has_payload": "f30f2b373864eaff49baf96db8ab8cb7"}
# AOT ID: ['3_inference']
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
from torch._inductor.codegen.memory_planning import _align as align
from torch import device, empty_strided
from torch._inductor.async_compile import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
from torch._inductor.codegen.multi_kernel import MultiKernelCall
aten = torch.ops.aten
inductor_ops = torch.ops.inductor
_quantized = torch.ops._quantized
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
alloc_from_pool = torch.ops.inductor._alloc_from_pool
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
async_compile = AsyncCompile()
_frozen_param0 = None # device(type='cpu') torch.float32 (50358, 768) (768, 1) 7f2eb1d44630
_frozen_param1 = None # device(type='cpu') torch.float32 (2, 768) (768, 1) 7f2eb1d445e0
_frozen_param3 = None # device(type='cpu') torch.float32 (768,) (1,) 7f2eb1d44540
_frozen_param4 = None # device(type='cpu') torch.float32 (768,) (1,) 7f2eb1d44810
_frozen_param6 = None # device(type='cpu') torch.float32 (1, 832, 768) (638976, 768, 1) 7f2e3165ccc0
cpp_fused_add_cat_embedding_mul_native_layer_norm_0 = async_compile.cpp_pybinding(['const int64_t*', 'const float*', 'const int64_t*', 'const float*', 'const float*', 'const float*', 'const float*', 'const float*', 'const float*', 'float*', 'float*', 'float*', 'float*', 'float*', 'float*', 'float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const int64_t* in_ptr0,
const float* in_ptr1,
const int64_t* in_ptr2,
const float* in_ptr3,
const float* in_ptr4,
const float* in_ptr5,
const float* in_ptr6,
const float* in_ptr7,
const float* in_ptr8,
float* out_ptr0,
float* out_ptr1,
float* out_ptr2,
float* out_ptr3,
float* out_ptr4,
float* out_ptr5,
float* out_ptr6,
float* out_ptr7)
#pragma omp parallel num_threads(56)
int tid = omp_get_thread_num();
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(832L); x0+=static_cast<long>(1L))
Welford<float> tmp_acc0 = Welford<float>();
Welford<at::vec::Vectorized<float>> tmp_acc0_vec = Welford<at::vec::Vectorized<float>>();
static WeightRecp<at::vec::Vectorized<float>> weight_recps(static_cast<long>(48L));
for(long x1=static_cast<long>(0L); x1<static_cast<long>(768L); x1+=static_cast<long>(16L))
auto tmp0 = in_ptr0[static_cast<long>(x0)];
auto tmp10 = in_ptr2[static_cast<long>(x0)];
auto tmp21 = at::vec::Vectorized<float>::loadu(in_ptr4 + static_cast<long>(x1 + (768L*x0)), 16);
auto tmp1 = 50358L;
auto tmp2 = c10::convert<int64_t>(tmp1);
auto tmp3 = decltype(tmp0)(tmp0 + tmp2);
auto tmp4 = tmp0 < 0;
auto tmp5 = tmp4 ? tmp3 : tmp0;
auto tmp6 = tmp5;
auto tmp7 = c10::convert<int64_t>(tmp6);
TORCH_CHECK((0 <= tmp7) & (tmp7 < 50358L), "index out of bounds: 0 <= tmp7 < 50358L");
auto tmp9 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(x1 + (768L*tmp5)), 16);
auto tmp11 = 2L;
auto tmp12 = c10::convert<int64_t>(tmp11);
auto tmp13 = decltype(tmp10)(tmp10 + tmp12);
auto tmp14 = tmp10 < 0;
auto tmp15 = tmp14 ? tmp13 : tmp10;
auto tmp16 = tmp15;
auto tmp17 = c10::convert<int64_t>(tmp16);
TORCH_CHECK((0 <= tmp17) & (tmp17 < 2L), "index out of bounds: 0 <= tmp17 < 2L");
auto tmp19 = at::vec::Vectorized<float>::loadu(in_ptr3 + static_cast<long>(x1 + (768L*tmp15)), 16);
auto tmp20 = tmp9 + tmp19;
auto tmp22 = tmp20 + tmp21; + static_cast<long>(x1 + (768L*x0)));
tmp_acc0_vec = welford_combine(tmp_acc0_vec, tmp22, &weight_recps);
tmp_acc0 = welford_combine(tmp_acc0, welford_vec_reduce_all(tmp_acc0_vec));
out_ptr1[static_cast<long>(x0)] = static_cast<float>(tmp_acc0.mean);
out_ptr2[static_cast<long>(x0)] = static_cast<float>(tmp_acc0.m2);
for(long x1=static_cast<long>(0L); x1<static_cast<long>(768L); x1+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr0 + static_cast<long>(x1 + (768L*x0)), 16);
auto tmp1 = out_ptr1[static_cast<long>(x0)];
auto tmp4 = out_ptr2[static_cast<long>(x0)];
auto tmp12 = at::vec::Vectorized<float>::loadu(in_ptr5 + static_cast<long>(x1), 16);
auto tmp14 = at::vec::Vectorized<float>::loadu(in_ptr6 + static_cast<long>(x1), 16);
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 - tmp2;
auto tmp5 = static_cast<float>(768.0);
auto tmp6 = tmp4 / tmp5;
auto tmp7 = static_cast<float>(1e-12);
auto tmp8 = decltype(tmp6)(tmp6 + tmp7);
auto tmp9 = 1 / std::sqrt(tmp8);
auto tmp10 = at::vec::Vectorized<float>(tmp9);
auto tmp11 = tmp3 * tmp10;
auto tmp13 = tmp11 * tmp12;
auto tmp15 = tmp13 + tmp14; + static_cast<long>(x1 + (768L*x0)));
#pragma omp single
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(9L); x0+=static_cast<long>(1L))
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr7 + static_cast<long>(64L + x1 + (64L*x0)), 16); + static_cast<long>(x1 + (192L*x0)));
#pragma omp single
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(9L); x0+=static_cast<long>(1L))
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr7 + static_cast<long>(128L + x1 + (64L*x0)), 16); + static_cast<long>(x1 + (192L*x0)));
#pragma omp single
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(9L); x0+=static_cast<long>(1L))
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr7 + static_cast<long>(192L + x1 + (64L*x0)), 16); + static_cast<long>(x1 + (192L*x0)));
#pragma omp single
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(9L); x0+=static_cast<long>(1L))
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L))
for(long x2=static_cast<long>(0L); x2<static_cast<long>(192L); x2+=static_cast<long>(16L))
auto tmp0 = in_ptr7[static_cast<long>(128L + x1 + (64L*x0))];
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr8 + static_cast<long>(x2 + (192L*x0)), 16);
auto tmp2 = at::vec::Vectorized<float>(tmp0);
auto tmp3 = tmp2 * tmp1; + static_cast<long>(x2 + (192L*x1) + (12288L*x0)));
del async_compile
def call(args):
arg6_1, arg7_1, arg8_1 = args
assert_size_stride(arg6_1, (1, 832), (832, 1))
assert_size_stride(arg7_1, (1, 832), (832, 1))
assert_size_stride(arg8_1, (1, 832), (832, 1))
buf0 = empty_strided_cpu((1, 832, 768), (638976, 768, 1), torch.float32)
buf1 = empty_strided_cpu((1, 832, 1), (832, 1, 832), torch.float32)
buf2 = empty_strided_cpu((1, 832, 1), (832, 1, 832), torch.float32)
buf4 = empty_strided_cpu((1, 832, 768), (638976, 768, 1), torch.float32)
buf8 = empty_strided_cpu((1, 9, 192), (1728, 192, 1), torch.float32)
buf5 = reinterpret_tensor(buf8, (1, 9, 64), (1728, 192, 1), 0) # alias
buf6 = reinterpret_tensor(buf8, (1, 9, 64), (1728, 192, 1), 64) # alias
buf7 = reinterpret_tensor(buf8, (1, 9, 64), (1728, 192, 1), 128) # alias
buf9 = empty_strided_cpu((1, 9, 64, 192), (110592, 12288, 192, 1), torch.float32)
cpp_fused_add_cat_embedding_mul_native_layer_norm_0(arg6_1, _frozen_param0, arg8_1, _frozen_param1, _frozen_param6, _frozen_param3, _frozen_param4, arg7_1, buf8, buf0, buf1, buf2, buf4, buf5, buf6, buf7, buf9)
del arg6_1
del arg8_1
return (buf4, reinterpret_tensor(buf9, (1, 1, 9, 64, 192), (110592, 110592, 12288, 192, 1), 0), reinterpret_tensor(arg7_1, (1, 1, 832, 1), (832, 832, 1, 1), 0), reinterpret_tensor(arg7_1, (1, 1, 1, 832), (832, 832, 832, 1), 0), reinterpret_tensor(arg7_1, (1, 13, 64), (832, 64, 1), 0), )
def benchmark_compiled_module(times=10, repeat=10):
from torch._dynamo.testing import rand_strided
from torch._inductor.utils import print_performance
global _frozen_param0
_frozen_param0 = rand_strided((50358, 768), (768, 1), device='cpu', dtype=torch.float32)
global _frozen_param1
_frozen_param1 = rand_strided((2, 768), (768, 1), device='cpu', dtype=torch.float32)
global _frozen_param3
_frozen_param3 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param4
_frozen_param4 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param6
_frozen_param6 = rand_strided((1, 832, 768), (638976, 768, 1), device='cpu', dtype=torch.float32)
arg6_1 = rand_strided((1, 832), (832, 1), device='cpu', dtype=torch.int64)
arg7_1 = rand_strided((1, 832), (832, 1), device='cpu', dtype=torch.float32)
arg8_1 = rand_strided((1, 832), (832, 1), device='cpu', dtype=torch.int64)
fn = lambda: call([arg6_1, arg7_1, arg8_1])
return print_performance(fn, times=times, repeat=repeat)
if __name__ == "__main__":
from torch._inductor.wrapper_benchmark import compiled_module_main
compiled_module_main('hf_BigBird', benchmark_compiled_module)
V0627 17:31:03.542000 139845268738432 torch/_dynamo/] {"dynamo_guards": {}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
V0627 17:31:03.542000 139845268738432 torch/_dynamo/] {"dynamo_cpp_guards_str": {}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1, "has_payload": "1dfadefa57d2d698b82df0a252ee757b"}
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/ in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839714901584)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| | | +- GuardManager: source=L['self'].config, accessed_by=DictGetItemGuardAccessor(config)
| | | | +- TYPE_MATCH: ___check_type_id(L['self'].config, 139839810624528)
| | | +- GuardManager: source=L['self']._modules, accessed_by=DictGetItemGuardAccessor(_modules)
| | | | +- GuardManager: source=L['self'].encoder, accessed_by=DictGetItemGuardAccessor(encoder)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].encoder, 139839713378016)
| | | | | +- GuardManager: source=L['self'].encoder.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'], 7685824)
| | | | +- GuardManager: source=L['self'].embeddings, accessed_by=DictGetItemGuardAccessor(embeddings)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings, 139839713378208)
| | | | | +- GuardManager: source=L['self'].embeddings.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- DICT_CONTAINS: not ___dict_contains('forward', L['self'].embeddings.__dict__)
| | | | | | +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'], 7685824)
| | | | | | +- GuardManager: source=L['self'].embeddings._modules, accessed_by=DictGetItemGuardAccessor(_modules)
| | | | | | | +- GuardManager: source=L['self'].embeddings.dropout, accessed_by=DictGetItemGuardAccessor(dropout)
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.dropout, 139839202278704)
| | | | | | | | +- GuardManager: source=L['self'].embeddings.dropout.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | | | | +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'], 7685824)
| | | | | | | +- GuardManager: source=L['self'].embeddings.LayerNorm, accessed_by=DictGetItemGuardAccessor(LayerNorm)
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.LayerNorm, 139839202278800)
| | | | | | | | +- GuardManager: source=L['self'].embeddings.LayerNorm.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | | | | +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'], 7685824)
| | | | | | | +- GuardManager: source=L['self'].embeddings.word_embeddings, accessed_by=DictGetItemGuardAccessor(word_embeddings)
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.word_embeddings, 139839202271840)
| | | | | | | | +- GuardManager: source=L['self'].embeddings.word_embeddings.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | | | | +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'], 7685824)
| | | | | | | +- GuardManager: source=L['self'].embeddings.position_embeddings, accessed_by=DictGetItemGuardAccessor(position_embeddings)
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.position_embeddings, 139839202279184)
| | | | | | | | +- GuardManager: source=L['self'].embeddings.position_embeddings.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | | | | +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'], 7685824)
| | | | | | | +- GuardManager: source=L['self'].embeddings.token_type_embeddings, accessed_by=DictGetItemGuardAccessor(token_type_embeddings)
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.token_type_embeddings, 139839202279328)
| | | | | | | | +- GuardManager: source=L['self'].embeddings.token_type_embeddings.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | | | | +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'], 7685824)
| | | | | | +- GuardManager: source=L['self'].embeddings._buffers, accessed_by=DictGetItemGuardAccessor(_buffers)
| | | | | | | +- GuardManager: source=L['self'].embeddings.position_ids, accessed_by=DictGetItemGuardAccessor(position_ids)
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.position_ids, 139838528701040)
| | | | | | +- GuardManager: source=L['self'].embeddings._forward_hooks, accessed_by=DictGetItemGuardAccessor(_forward_hooks)
| | | | | | +- GuardManager: source=L['self'].embeddings._backward_hooks, accessed_by=DictGetItemGuardAccessor(_backward_hooks)
| | | | | | +- GuardManager: source=L['self'].embeddings._forward_pre_hooks, accessed_by=DictGetItemGuardAccessor(_forward_pre_hooks)
| | | | | | +- GuardManager: source=L['self'].embeddings.rescale_embeddings, accessed_by=DictGetItemGuardAccessor(rescale_embeddings)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.rescale_embeddings, 7685824)
| | | | | | +- GuardManager: source=L['self'].embeddings._backward_pre_hooks, accessed_by=DictGetItemGuardAccessor(_backward_pre_hooks)
| | | +- GuardManager: source=L['self'].block_size, accessed_by=DictGetItemGuardAccessor(block_size)
| | | | +- EQUALS_MATCH: L['self'].block_size == 64
| | | +- GuardManager: source=L['self'].attention_type, accessed_by=DictGetItemGuardAccessor(attention_type)
| | | | +- EQUALS_MATCH: L['self'].attention_type == 'block_sparse'
| +- GuardManager: source=L['___stack0'], accessed_by=DictGetItemGuardAccessor(___stack0)
| | +- TYPE_MATCH: ___check_type_id(L['___stack0'], 7625984)
| | +- LENGTH_CHECK: len(L['___stack0']) == 6
| | +- GuardManager: source=L['___stack0'][0], accessed_by=TupleGetItemGuardAccessor(0)
| | | +- EQUALS_MATCH: L['___stack0'][0] == 13
| | +- GuardManager: source=L['___stack0'][1], accessed_by=TupleGetItemGuardAccessor(1)
| | | +- TENSOR_MATCH: check_tensor(L['___stack0'][1], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int64, device=None, requires_grad=False, size=[1, 832], stride=[832, 1])
| | | +- NO_HASATTR: hasattr(L['___stack0'][1], '_dynamo_dynamic_indices') == False
| | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['___stack0'][1], L['___stack0'][2], L['___stack0'][3])
| | +- GuardManager: source=L['___stack0'][2], accessed_by=TupleGetItemGuardAccessor(2)
| | | +- TENSOR_MATCH: check_tensor(L['___stack0'][2], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 832], stride=[832, 1])
| | | +- NO_HASATTR: hasattr(L['___stack0'][2], '_dynamo_dynamic_indices') == False
| | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['___stack0'][1], L['___stack0'][2], L['___stack0'][3])
| | +- GuardManager: source=L['___stack0'][3], accessed_by=TupleGetItemGuardAccessor(3)
| | | +- TENSOR_MATCH: check_tensor(L['___stack0'][3], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int64, device=None, requires_grad=False, size=[1, 832], stride=[832, 1])
| | | +- NO_HASATTR: hasattr(L['___stack0'][3], '_dynamo_dynamic_indices') == False
| | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['___stack0'][1], L['___stack0'][2], L['___stack0'][3])
| | +- GuardManager: source=L['___stack0'][4], accessed_by=TupleGetItemGuardAccessor(4)
| | | +- ID_MATCH: ___check_obj_id(L['___stack0'][4], 7636800)
| | +- GuardManager: source=L['___stack0'][5], accessed_by=TupleGetItemGuardAccessor(5)
| | | +- ID_MATCH: ___check_obj_id(L['___stack0'][5], 7636800)
| +- GuardManager: source=L['head_mask'], accessed_by=DictGetItemGuardAccessor(head_mask)
| | +- ID_MATCH: ___check_obj_id(L['head_mask'], 7636800)
| +- GuardManager: source=L['use_cache'], accessed_by=DictGetItemGuardAccessor(use_cache)
| | +- ID_MATCH: ___check_obj_id(L['use_cache'], 7685824)
| +- GuardManager: source=L['return_dict'], accessed_by=DictGetItemGuardAccessor(return_dict)
| | +- ID_MATCH: ___check_obj_id(L['return_dict'], 7685856)
| +- GuardManager: source=L['past_key_values'], accessed_by=DictGetItemGuardAccessor(past_key_values)
| | +- ID_MATCH: ___check_obj_id(L['past_key_values'], 7636800)
| +- GuardManager: source=L['output_attentions'], accessed_by=DictGetItemGuardAccessor(output_attentions)
| | +- ID_MATCH: ___check_obj_id(L['output_attentions'], 7685824)
| +- GuardManager: source=L['output_hidden_states'], accessed_by=DictGetItemGuardAccessor(output_hidden_states)
| | +- ID_MATCH: ___check_obj_id(L['output_hidden_states'], 7685824)
| +- GuardManager: source=L['encoder_hidden_states'], accessed_by=DictGetItemGuardAccessor(encoder_hidden_states)
| | +- ID_MATCH: ___check_obj_id(L['encoder_hidden_states'], 7636800)
| +- GuardManager: source=L['past_key_values_length'], accessed_by=DictGetItemGuardAccessor(past_key_values_length)
| | +- EQUALS_MATCH: L['past_key_values_length'] == 0
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['torch'], accessed_by=DictGetItemGuardAccessor(torch)
| | | +- ID_MATCH: ___check_obj_id(G['torch'], 139845236322800)
| | | +- GuardManager: source=G['torch'].cat, accessed_by=GetAttrGuardAccessor(cat)
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].cat, 139845228834672)
| | | +- GuardManager: source=G['torch'].einsum, accessed_by=GetAttrGuardAccessor(einsum)
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].einsum, 139842415911568)
| | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'], accessed_by=DictGetItemGuardAccessor(__import_torch_dot_nn_dot_modules_dot_module)
| | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'], 139842442598640)
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'].torch, accessed_by=GetAttrGuardAccessor(torch)
| | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'].torch, 139845236322800)
| | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C, accessed_by=GetAttrGuardAccessor(_C)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C, 139845228547104)
| | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C._get_tracing_state, accessed_by=GetAttrGuardAccessor(_get_tracing_state)
| | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C._get_tracing_state, 139842451895088)
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_hooks, accessed_by=GetAttrGuardAccessor(_global_forward_hooks)
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_hooks, 7497696)
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_hooks
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_hooks, accessed_by=GetAttrGuardAccessor(_global_backward_hooks)
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_hooks, 7497696)
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_hooks
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_pre_hooks, accessed_by=GetAttrGuardAccessor(_global_forward_pre_hooks)
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_pre_hooks, 7497696)
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_pre_hooks
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_pre_hooks, accessed_by=GetAttrGuardAccessor(_global_backward_pre_hooks)
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_pre_hooks, 7497696)
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_pre_hooks
V0627 17:31:03.543000 139845268738432 torch/_dynamo/] {"compilation_metrics": {"compile_id": "5/0", "frame_key": "10", "co_name": "torch_dynamo_resume_in_forward_at_2077", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/", "co_firstlineno": 2077, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 57, "shape_env_guard_count": 0, "graph_op_count": 18, "graph_node_count": 23, "graph_input_count": 3, "start_time": 1719534662.9121282, "entire_frame_compile_time_s": 0.6307895183563232, "backend_compile_time_s": 0.49609994888305664, "inductor_compile_time_s": 0.37875938415527344, "code_gen_time_s": 0.3245351314544678, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Graph break due to unsupported builtin numpy.random.mtrand.seed. This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind). If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround. If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see for more details) or, if it is traceable, use torch.compiler.allow_in_graph."], "dynamo_time_before_restart_s": 0.08347654342651367, "has_guarded_code": true}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.544000 139845268738432 torch/_dynamo/] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.565000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 0, "describer_id": 19, "size": 442368}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.565000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 0, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 19}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.565000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 19, "id": 0, "source": "L['band_mask']"}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.566000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 1, "describer_id": 19, "size": 2555904}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.566000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 1, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6e30>", "describer_id": 19}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.566000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 19, "id": 1, "source": "L['hidden_states']"}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.567000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 2, "describer_id": 19, "size": 3328}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.568000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 19}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.568000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 2, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 19}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.568000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 19, "id": 2, "source": "L['from_mask']"}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.570000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 5, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 19}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.570000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 19, "id": 5, "source": "L['to_mask']"}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.600000 139845268738432 torch/_dynamo/] {"compilation_metrics": {"compile_id": "6/0", "frame_key": "11", "co_name": "forward", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/", "co_firstlineno": 1578, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": null, "shape_env_guard_count": null, "graph_op_count": null, "graph_node_count": null, "graph_input_count": null, "start_time": 1719534663.5449224, "entire_frame_compile_time_s": null, "backend_compile_time_s": null, "inductor_compile_time_s": null, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": [], "dynamo_time_before_restart_s": 0.05504441261291504, "has_guarded_code": false}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.600000 139845268738432 torch/_dynamo/] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.607000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 0, "describer_id": 20, "size": 442368}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.608000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 0, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 20}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.608000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 20, "id": 0, "source": "L['band_mask']"}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.609000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 1, "describer_id": 20, "size": 2555904}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.609000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 1, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6e30>", "describer_id": 20}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.609000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 20, "id": 1, "source": "L['hidden_states']"}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.610000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 2, "describer_id": 20, "size": 3328}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.610000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 20}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.610000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 2, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 20}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.611000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 20, "id": 2, "source": "L['from_mask']"}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.612000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 4, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 20}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.612000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 20, "id": 4, "source": "L['to_mask']"}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.643000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 0, "describer_id": 21, "size": 2555904}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.643000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 0, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6e30>", "describer_id": 21}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.643000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 21, "id": 0, "source": "L['hidden_states']"}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.644000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 1, "describer_id": 21, "size": 442368}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.644000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 1, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 21}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.644000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 21, "id": 1, "source": "L['band_mask']"}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.645000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 2, "describer_id": 21, "size": 3328}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.645000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 21}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.645000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 2, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 21}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.645000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 21, "id": 2, "source": "L['from_mask']"}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.646000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 4, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 21}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.647000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 21, "id": 4, "source": "L['to_mask']"}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.648000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 5, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 13, 64], "is_leaf": true, "is_view": true, "stride": [832, 64, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317fbec0>", "describer_id": 21}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.648000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 21, "id": 5, "source": "L['blocked_encoder_mask']"}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.656000 139845268738432 torch/_dynamo/] {"dynamo_guards": {}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
V0627 17:31:03.656000 139845268738432 torch/_dynamo/] {"dynamo_cpp_guards_str": {}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1, "has_payload": "76aa2c3aac969b0b973556e5e5d20d8b"}
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/ in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202276400)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| | | +- GuardManager: source=L['self']._modules, accessed_by=DictGetItemGuardAccessor(_modules)
| | | | +- GuardManager: source=L['self'].attention, accessed_by=DictGetItemGuardAccessor(attention)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].attention, 139839202275632)
| | | | | +- GuardManager: source=L['self'].attention.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'], 7685824)
| +- GuardManager: source=L['to_mask'], accessed_by=DictGetItemGuardAccessor(to_mask)
| | +- TENSOR_MATCH: check_tensor(L['to_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 1, 832], stride=[832, 832, 832, 1])
| | +- NO_HASATTR: hasattr(L['to_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['blocked_encoder_mask'])
| +- GuardManager: source=L['band_mask'], accessed_by=DictGetItemGuardAccessor(band_mask)
| | +- TENSOR_MATCH: check_tensor(L['band_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 9, 64, 192], stride=[110592, 110592, 12288, 192, 1])
| | +- NO_HASATTR: hasattr(L['band_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['blocked_encoder_mask'])
| +- GuardManager: source=L['from_mask'], accessed_by=DictGetItemGuardAccessor(from_mask)
| | +- TENSOR_MATCH: check_tensor(L['from_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 832, 1], stride=[832, 832, 1, 1])
| | +- NO_HASATTR: hasattr(L['from_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['blocked_encoder_mask'])
| +- GuardManager: source=L['head_mask'], accessed_by=DictGetItemGuardAccessor(head_mask)
| | +- ID_MATCH: ___check_obj_id(L['head_mask'], 7636800)
| +- GuardManager: source=L['hidden_states'], accessed_by=DictGetItemGuardAccessor(hidden_states)
| | +- TENSOR_MATCH: check_tensor(L['hidden_states'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 832, 768], stride=[638976, 768, 1])
| | +- NO_HASATTR: hasattr(L['hidden_states'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['blocked_encoder_mask'])
| +- GuardManager: source=L['attention_mask'], accessed_by=DictGetItemGuardAccessor(attention_mask)
| | +- ID_MATCH: ___check_obj_id(L['attention_mask'], 7636800)
| +- GuardManager: source=L['past_key_value'], accessed_by=DictGetItemGuardAccessor(past_key_value)
| | +- ID_MATCH: ___check_obj_id(L['past_key_value'], 7636800)
| +- GuardManager: source=L['output_attentions'], accessed_by=DictGetItemGuardAccessor(output_attentions)
| | +- ID_MATCH: ___check_obj_id(L['output_attentions'], 7685824)
| +- GuardManager: source=L['blocked_encoder_mask'], accessed_by=DictGetItemGuardAccessor(blocked_encoder_mask)
| | +- TENSOR_MATCH: check_tensor(L['blocked_encoder_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 13, 64], stride=[832, 64, 1])
| | +- NO_HASATTR: hasattr(L['blocked_encoder_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['blocked_encoder_mask'])
| +- GuardManager: source=L['encoder_hidden_states'], accessed_by=DictGetItemGuardAccessor(encoder_hidden_states)
| | +- ID_MATCH: ___check_obj_id(L['encoder_hidden_states'], 7636800)
| +- GuardManager: source=L['encoder_attention_mask'], accessed_by=DictGetItemGuardAccessor(encoder_attention_mask)
| | +- ID_MATCH: ___check_obj_id(L['encoder_attention_mask'], 7636800)
V0627 17:31:03.657000 139845268738432 torch/_dynamo/] {"compilation_metrics": {"compile_id": "7/0", "frame_key": "12", "co_name": "forward", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/", "co_firstlineno": 1472, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 18, "shape_env_guard_count": 0, "graph_op_count": 0, "graph_node_count": 5, "graph_input_count": 5, "start_time": 1719534663.6010149, "entire_frame_compile_time_s": 0.05594229698181152, "backend_compile_time_s": null, "inductor_compile_time_s": null, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Graph break due to unsupported builtin numpy.random.mtrand.seed. This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind). If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround. If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see for more details) or, if it is traceable, use torch.compiler.allow_in_graph."], "dynamo_time_before_restart_s": 0.039438724517822266, "has_guarded_code": true}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.657000 139845268738432 torch/_dynamo/] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.659000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 0, "describer_id": 22, "size": 442368}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.659000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 0, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 22}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.659000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 22, "id": 0, "source": "L['band_mask']"}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.660000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 1, "describer_id": 22, "size": 2555904}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.660000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 1, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6e30>", "describer_id": 22}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.660000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 22, "id": 1, "source": "L['hidden_states']"}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.661000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 2, "describer_id": 22, "size": 3328}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.662000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 22}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.662000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 2, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 22}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.662000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 22, "id": 2, "source": "L['from_mask']"}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.664000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 4, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 22}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.664000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 22, "id": 4, "source": "L['to_mask']"}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.694000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 0, "describer_id": 23, "size": 442368}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.694000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 0, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 23}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.694000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 23, "id": 0, "source": "L['band_mask']"}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.695000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 1, "describer_id": 23, "size": 2555904}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.695000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 1, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6e30>", "describer_id": 23}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.695000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 23, "id": 1, "source": "L['hidden_states']"}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.696000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 2, "describer_id": 23, "size": 3328}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.696000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 23}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.697000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 2, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 23}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.697000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 23, "id": 2, "source": "L['from_mask']"}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.698000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 4, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 23}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.699000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 23, "id": 4, "source": "L['to_mask']"}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.701000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 5, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 13, 64], "is_leaf": true, "is_view": true, "stride": [832, 64, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317fbec0>", "describer_id": 23}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.701000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 23, "id": 5, "source": "L['from_blocked_mask']"}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.704000 139845268738432 torch/_dynamo/] {"dynamo_output_graph": {"sizes": {"l_band_mask_": [1, 1, 9, 64, 192], "l_from_mask_": [1, 1, 832, 1], "l_to_mask_": [1, 1, 1, 832], "band_mask": [1, 1, 9, 64, 192], "from_mask": [1, 1, 832, 1], "to_mask": [1, 1, 1, 832]}}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1, "has_payload": "b5eca5f100188f494b5033015854eb4e"}
class GraphModule(torch.nn.Module):
def forward(self, L_band_mask_: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu", L_from_mask_: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu", L_to_mask_: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu"):
l_band_mask_ = L_band_mask_
l_from_mask_ = L_from_mask_
l_to_mask_ = L_to_mask_
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: band_mask =
band_mask: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" =; l_band_mask_ = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: from_mask =
from_mask: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu" =; l_from_mask_ = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: to_mask =
to_mask: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" =; l_to_mask_ = None
return (band_mask, from_mask, to_mask)
V0627 17:31:03.718000 139845268738432 torch/_functorch/_aot_autograd/] {"aot_forward_graph": {}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1, "has_payload": "3ad949bb5c0c76a73cad2e99f5c9ebe2"}
class <lambda>(torch.nn.Module):
def forward(self, arg0_1: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu", arg1_1: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu", arg2_1: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu"):
return (arg0_1, arg1_1, arg2_1)
V0627 17:31:03.731000 139845268738432 torch/_dynamo/] {"dynamo_guards": {}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
V0627 17:31:03.731000 139845268738432 torch/_dynamo/] {"dynamo_cpp_guards_str": {}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1, "has_payload": "6f76e9e822f6dc2ebb0dbc0f0100927d"}
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/ in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202275632)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| | | +- GuardManager: source=L['self']._modules, accessed_by=DictGetItemGuardAccessor(_modules)
| | | | +- GuardManager: source=L['self'].self, accessed_by=DictGetItemGuardAccessor(self)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].self, 139839202274384)
| | | | | +- GuardManager: source=L['self'].self.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'], 7685824)
| | | +- GuardManager: source=L['self'].attention_type, accessed_by=DictGetItemGuardAccessor(attention_type)
| | | | +- EQUALS_MATCH: L['self'].attention_type == 'block_sparse'
| +- GuardManager: source=L['to_mask'], accessed_by=DictGetItemGuardAccessor(to_mask)
| | +- TENSOR_MATCH: check_tensor(L['to_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 1, 832], stride=[832, 832, 832, 1])
| | +- NO_HASATTR: hasattr(L['to_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['band_mask'], accessed_by=DictGetItemGuardAccessor(band_mask)
| | +- TENSOR_MATCH: check_tensor(L['band_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 9, 64, 192], stride=[110592, 110592, 12288, 192, 1])
| | +- NO_HASATTR: hasattr(L['band_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['from_mask'], accessed_by=DictGetItemGuardAccessor(from_mask)
| | +- TENSOR_MATCH: check_tensor(L['from_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 832, 1], stride=[832, 832, 1, 1])
| | +- NO_HASATTR: hasattr(L['from_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['hidden_states'], accessed_by=DictGetItemGuardAccessor(hidden_states)
| | +- TENSOR_MATCH: check_tensor(L['hidden_states'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 832, 768], stride=[638976, 768, 1])
| | +- NO_HASATTR: hasattr(L['hidden_states'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['from_blocked_mask'], accessed_by=DictGetItemGuardAccessor(from_blocked_mask)
| | +- TENSOR_MATCH: check_tensor(L['from_blocked_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 13, 64], stride=[832, 64, 1])
| | +- NO_HASATTR: hasattr(L['from_blocked_mask'], '_dynamo_dynamic_indices') == False
| | +- TENSOR_ALIASING: L['from_blocked_mask'] is L['to_blocked_mask']
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['output_attentions'], accessed_by=DictGetItemGuardAccessor(output_attentions)
| | +- ID_MATCH: ___check_obj_id(L['output_attentions'], 7685824)
| +- GuardManager: source=L['encoder_hidden_states'], accessed_by=DictGetItemGuardAccessor(encoder_hidden_states)
| | +- ID_MATCH: ___check_obj_id(L['encoder_hidden_states'], 7636800)
| +- GuardManager: source=L['to_blocked_mask'], accessed_by=DictGetItemGuardAccessor(to_blocked_mask)
| | +- TENSOR_ALIASING: L['from_blocked_mask'] is L['to_blocked_mask']
V0627 17:31:03.732000 139845268738432 torch/_dynamo/] {"compilation_metrics": {"compile_id": "8/0", "frame_key": "13", "co_name": "forward", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/", "co_firstlineno": 1365, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 16, "shape_env_guard_count": 0, "graph_op_count": 3, "graph_node_count": 7, "graph_input_count": 3, "start_time": 1719534663.657894, "entire_frame_compile_time_s": 0.07398724555969238, "backend_compile_time_s": 0.02206587791442871, "inductor_compile_time_s": 0.0003921985626220703, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Graph break due to unsupported builtin numpy.random.mtrand.seed. This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind). If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround. If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see for more details) or, if it is traceable, use torch.compiler.allow_in_graph."], "dynamo_time_before_restart_s": 0.03484821319580078, "has_guarded_code": true}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.732000 139845268738432 torch/_dynamo/] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 9, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.733000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 0, "describer_id": 25, "size": 2555904}, "frame_id": 9, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.733000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 0, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6e30>", "describer_id": 25}, "frame_id": 9, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.733000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 25, "id": 0, "source": "L['hidden_states']"}, "frame_id": 9, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.752000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 0, "describer_id": 26, "size": 2555904}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.752000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 0, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6e30>", "describer_id": 26}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.752000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 26, "id": 0, "source": "L['hidden_states']"}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.763000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 7, "describer_id": 26, "size": 442368}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.763000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 7, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 7, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 26}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.763000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 26, "id": 7, "source": "L['band_mask']"}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.764000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 8, "describer_id": 26, "size": 3328}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.764000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 9, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 8, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 26}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.764000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 8, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 8, "base": 9, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 26}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.764000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 26, "id": 8, "source": "L['from_mask']"}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.765000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 10, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 8, "base": 9, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 26}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.765000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 26, "id": 10, "source": "L['to_mask']"}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.766000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 11, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 13, 64], "is_leaf": true, "is_view": true, "stride": [832, 64, 1], "storage": 8, "base": 9, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317fbec0>", "describer_id": 26}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.766000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 26, "id": 11, "source": "L['from_blocked_mask']"}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.768000 139845268738432 torch/_dynamo/] {"dynamo_output_graph": {"sizes": {"l_hidden_states_": [1, 832, 768], "l__self___query": [1, 832, 768], "x": [1, 832, 12, 64], "query_layer": [1, 12, 832, 64], "l__self___key": [1, 832, 768], "x_1": [1, 832, 12, 64], "key_layer": [1, 12, 832, 64], "l__self___value": [1, 832, 768], "x_2": [1, 832, 12, 64], "value_layer": [1, 12, 832, 64]}}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1, "has_payload": "0aeb326082c3dce6efb144a844a46bdf"}
class GraphModule(torch.nn.Module):
def forward(self, L_hidden_states_: "f32[1, 832, 768][638976, 768, 1]cpu"):
l_hidden_states_ = L_hidden_states_
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: query_layer = self.transpose_for_scores(self.query(hidden_states))
l__self___query: "bf16[1, 832, 768][638976, 768, 1]cpu" = self.L__self___query(l_hidden_states_)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in transpose_for_scores, code: x = x.view(*new_x_shape)
x: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = l__self___query.view(1, 832, 12, 64); l__self___query = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
query_layer: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = x.permute(0, 2, 1, 3); x = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: key_layer = self.transpose_for_scores(self.key(hidden_states))
l__self___key: "bf16[1, 832, 768][638976, 768, 1]cpu" = self.L__self___key(l_hidden_states_)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in transpose_for_scores, code: x = x.view(*new_x_shape)
x_1: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = l__self___key.view(1, 832, 12, 64); l__self___key = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
key_layer: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = x_1.permute(0, 2, 1, 3); x_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: value_layer = self.transpose_for_scores(self.value(hidden_states))
l__self___value: "bf16[1, 832, 768][638976, 768, 1]cpu" = self.L__self___value(l_hidden_states_); l_hidden_states_ = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in transpose_for_scores, code: x = x.view(*new_x_shape)
x_2: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = l__self___value.view(1, 832, 12, 64); l__self___value = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
value_layer: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = x_2.permute(0, 2, 1, 3); x_2 = None
return (query_layer, key_layer, value_layer)
V0627 17:31:03.817000 139845268738432 torch/_functorch/_aot_autograd/] {"aot_forward_graph": {}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1, "has_payload": "d91c82abb65a6c80f96ee652ae86f63d"}
class <lambda>(torch.nn.Module):
def forward(self, arg0_1: "f32[768, 768][768, 1]cpu", arg1_1: "f32[768][1]cpu", arg2_1: "f32[768, 768][768, 1]cpu", arg3_1: "f32[768][1]cpu", arg4_1: "f32[768, 768][768, 1]cpu", arg5_1: "f32[768][1]cpu", arg6_1: "f32[1, 832, 768][638976, 768, 1]cpu"):
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: query_layer = self.transpose_for_scores(self.query(hidden_states))
convert_element_type: "bf16[768][1]cpu" = torch.ops.prims.convert_element_type.default(arg1_1, torch.bfloat16); arg1_1 = None
convert_element_type_1: "bf16[768, 768][768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg0_1, torch.bfloat16); arg0_1 = None
convert_element_type_2: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg6_1, torch.bfloat16)
view: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.view.default(convert_element_type_2, [832, 768]); convert_element_type_2 = None
permute: "bf16[768, 768][1, 768]cpu" = torch.ops.aten.permute.default(convert_element_type_1, [1, 0]); convert_element_type_1 = None
addmm: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.addmm.default(convert_element_type, view, permute); convert_element_type = view = permute = None
view_1: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.view.default(addmm, [1, 832, 768]); addmm = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in transpose_for_scores, code: x = x.view(*new_x_shape)
view_2: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.view.default(view_1, [1, 832, 12, 64]); view_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
permute_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_2, [0, 2, 1, 3]); view_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: key_layer = self.transpose_for_scores(self.key(hidden_states))
convert_element_type_6: "bf16[768][1]cpu" = torch.ops.prims.convert_element_type.default(arg3_1, torch.bfloat16); arg3_1 = None
convert_element_type_7: "bf16[768, 768][768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg2_1, torch.bfloat16); arg2_1 = None
convert_element_type_8: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg6_1, torch.bfloat16)
view_3: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.view.default(convert_element_type_8, [832, 768]); convert_element_type_8 = None
permute_2: "bf16[768, 768][1, 768]cpu" = torch.ops.aten.permute.default(convert_element_type_7, [1, 0]); convert_element_type_7 = None
addmm_1: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.addmm.default(convert_element_type_6, view_3, permute_2); convert_element_type_6 = view_3 = permute_2 = None
view_4: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.view.default(addmm_1, [1, 832, 768]); addmm_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in transpose_for_scores, code: x = x.view(*new_x_shape)
view_5: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.view.default(view_4, [1, 832, 12, 64]); view_4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
permute_3: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_5, [0, 2, 1, 3]); view_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: value_layer = self.transpose_for_scores(self.value(hidden_states))
convert_element_type_12: "bf16[768][1]cpu" = torch.ops.prims.convert_element_type.default(arg5_1, torch.bfloat16); arg5_1 = None
convert_element_type_13: "bf16[768, 768][768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg4_1, torch.bfloat16); arg4_1 = None
convert_element_type_14: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg6_1, torch.bfloat16); arg6_1 = None
view_6: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.view.default(convert_element_type_14, [832, 768]); convert_element_type_14 = None
permute_4: "bf16[768, 768][1, 768]cpu" = torch.ops.aten.permute.default(convert_element_type_13, [1, 0]); convert_element_type_13 = None
addmm_2: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.addmm.default(convert_element_type_12, view_6, permute_4); convert_element_type_12 = view_6 = permute_4 = None
view_7: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.view.default(addmm_2, [1, 832, 768]); addmm_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in transpose_for_scores, code: x = x.view(*new_x_shape)
view_8: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.view.default(view_7, [1, 832, 12, 64]); view_7 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
permute_5: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_8, [0, 2, 1, 3]); view_8 = None
return (permute_1, permute_3, permute_5)
V0627 17:31:03.886000 139845268738432 torch/_inductor/] {"inductor_post_grad_graph": {}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1, "has_payload": "1df7e432445d21ccb26ae7f35b4ee86e"}
class <lambda>(torch.nn.Module):
def forward(self, arg6_1: "f32[1, 832, 768][638976, 768, 1]cpu"):
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: query_layer = self.transpose_for_scores(self.query(hidden_states))
_frozen_param6: "bf16[768][1]cpu" = self._frozen_param6
# No stacktrace found for following nodes
_frozen_param12: "bf16[768, 768][1, 0]cpu" = self._frozen_param12
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: key_layer = self.transpose_for_scores(self.key(hidden_states))
_frozen_param8: "bf16[768][1]cpu" = self._frozen_param8
# No stacktrace found for following nodes
_frozen_param13: "bf16[768, 768][1, 0]cpu" = self._frozen_param13
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: value_layer = self.transpose_for_scores(self.value(hidden_states))
_frozen_param10: "bf16[768][1]cpu" = self._frozen_param10
# No stacktrace found for following nodes
_frozen_param14: "bf16[768, 768][1, 0]cpu" = self._frozen_param14
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: query_layer = self.transpose_for_scores(self.query(hidden_states))
convert_element_type_2: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg6_1, torch.bfloat16); arg6_1 = None
_linear_pointwise_default_5: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.mkldnn._linear_pointwise.default(convert_element_type_2, _frozen_param12, _frozen_param6, 'none', [], ''); _frozen_param12 = _frozen_param6 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in transpose_for_scores, code: x = x.view(*new_x_shape)
view_2: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.reshape.default(_linear_pointwise_default_5, [1, 832, 12, 64]); _linear_pointwise_default_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
permute_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_2, [0, 2, 1, 3]); view_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: key_layer = self.transpose_for_scores(self.key(hidden_states))
_linear_pointwise_default_4: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.mkldnn._linear_pointwise.default(convert_element_type_2, _frozen_param13, _frozen_param8, 'none', [], ''); _frozen_param13 = _frozen_param8 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in transpose_for_scores, code: x = x.view(*new_x_shape)
view_5: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.reshape.default(_linear_pointwise_default_4, [1, 832, 12, 64]); _linear_pointwise_default_4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
permute_3: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_5, [0, 2, 1, 3]); view_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: value_layer = self.transpose_for_scores(self.value(hidden_states))
_linear_pointwise_default_3: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.mkldnn._linear_pointwise.default(convert_element_type_2, _frozen_param14, _frozen_param10, 'none', [], ''); convert_element_type_2 = _frozen_param14 = _frozen_param10 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in transpose_for_scores, code: x = x.view(*new_x_shape)
view_8: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.reshape.default(_linear_pointwise_default_3, [1, 832, 12, 64]); _linear_pointwise_default_3 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
permute_5: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_8, [0, 2, 1, 3]); view_8 = None
return (permute_1, permute_3, permute_5)
V0627 17:31:03.909000 139845268738432 torch/_inductor/] {"inductor_output_code": {"filename": "/tmp/torchinductor_leslie/rm/"}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1, "has_payload": "a40072c55bb96853547fea577aa47ba2"}
# AOT ID: ['5_inference']
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
from torch._inductor.codegen.memory_planning import _align as align
from torch import device, empty_strided
from torch._inductor.async_compile import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
from torch._inductor.codegen.multi_kernel import MultiKernelCall
aten = torch.ops.aten
inductor_ops = torch.ops.inductor
_quantized = torch.ops._quantized
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
alloc_from_pool = torch.ops.inductor._alloc_from_pool
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
async_compile = AsyncCompile()
_frozen_param6 = None # device(type='cpu') torch.bfloat16 (768,) (1,) 7f2e311efab0
_frozen_param12 = None # device(type='cpu') torch.bfloat16 (768, 768) (1, 0) 7f2e311c2750
_frozen_param8 = None # device(type='cpu') torch.bfloat16 (768,) (1,) 7f2e311b8770
_frozen_param13 = None # device(type='cpu') torch.bfloat16 (768, 768) (1, 0) 7f2e311cefc0
_frozen_param10 = None # device(type='cpu') torch.bfloat16 (768,) (1,) 7f2e311dbbf0
_frozen_param14 = None # device(type='cpu') torch.bfloat16 (768, 768) (1, 0) 7f2e311ac090
cpp_fused__to_copy_0 = async_compile.cpp_pybinding(['const float*', 'bfloat16*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const float* in_ptr0,
bfloat16* out_ptr0)
#pragma omp parallel num_threads(56)
int tid = omp_get_thread_num();
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(638976L); x0+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<long>(x0), 16);
auto tmp1 = at::vec::convert<bfloat16>(tmp0); + static_cast<long>(x0), 16);
del async_compile
def call(args):
arg6_1, = args
assert_size_stride(arg6_1, (1, 832, 768), (638976, 768, 1))
buf0 = empty_strided_cpu((1, 832, 768), (638976, 768, 1), torch.bfloat16)
cpp_fused__to_copy_0(arg6_1, buf0)
del arg6_1
buf1 = torch.ops.mkldnn._linear_pointwise(buf0, _frozen_param12, _frozen_param6, 'none', [-1], '')
buf2 = torch.ops.mkldnn._linear_pointwise(buf0, _frozen_param13, _frozen_param8, 'none', [-1], '')
buf3 = torch.ops.mkldnn._linear_pointwise(buf0, _frozen_param14, _frozen_param10, 'none', [-1], '')
return (reinterpret_tensor(buf1, (1, 12, 832, 64), (638976, 64, 768, 1), 0), reinterpret_tensor(buf2, (1, 12, 832, 64), (638976, 64, 768, 1), 0), reinterpret_tensor(buf3, (1, 12, 832, 64), (638976, 64, 768, 1), 0), )
def benchmark_compiled_module(times=10, repeat=10):
from torch._dynamo.testing import rand_strided
from torch._inductor.utils import print_performance
global _frozen_param6
_frozen_param6 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.bfloat16)
global _frozen_param12
_frozen_param12 = rand_strided((768, 768), (1, 0), device='cpu', dtype=torch.bfloat16)
global _frozen_param8
_frozen_param8 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.bfloat16)
global _frozen_param13
_frozen_param13 = rand_strided((768, 768), (1, 0), device='cpu', dtype=torch.bfloat16)
global _frozen_param10
_frozen_param10 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.bfloat16)
global _frozen_param14
_frozen_param14 = rand_strided((768, 768), (1, 0), device='cpu', dtype=torch.bfloat16)
arg6_1 = rand_strided((1, 832, 768), (638976, 768, 1), device='cpu', dtype=torch.float32)
fn = lambda: call([arg6_1])
return print_performance(fn, times=times, repeat=repeat)
if __name__ == "__main__":
from torch._inductor.wrapper_benchmark import compiled_module_main
compiled_module_main('hf_BigBird', benchmark_compiled_module)
V0627 17:31:03.922000 139845268738432 torch/_dynamo/] {"dynamo_guards": {}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
V0627 17:31:03.923000 139845268738432 torch/_dynamo/] {"dynamo_cpp_guards_str": {}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1, "has_payload": "300400f770725170203fcbe28e6ee223"}
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/ in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202274384)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| | | +- GuardManager: source=L['self']._modules, accessed_by=DictGetItemGuardAccessor(_modules)
| | | | +- GuardManager: source=L['self'].key, accessed_by=DictGetItemGuardAccessor(key)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].key, 139839202273568)
| | | | | +- GuardManager: source=L['self'].key.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'], 7685824)
| | | | +- GuardManager: source=L['self'].query, accessed_by=DictGetItemGuardAccessor(query)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].query, 139839202273616)
| | | | | +- GuardManager: source=L['self'].query.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'], 7685824)
| | | | +- GuardManager: source=L['self'].value, accessed_by=DictGetItemGuardAccessor(value)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].value, 139839202273040)
| | | | | +- GuardManager: source=L['self'].value.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'], 7685824)
| | | +- GuardManager: source=L['self'].seed, accessed_by=DictGetItemGuardAccessor(seed)
| | | | +- EQUALS_MATCH: L['self'].seed == 0
| | | +- GuardManager: source=L['self'].block_size, accessed_by=DictGetItemGuardAccessor(block_size)
| | | | +- EQUALS_MATCH: L['self'].block_size == 64
| | | +- GuardManager: source=L['self'].num_random_blocks, accessed_by=DictGetItemGuardAccessor(num_random_blocks)
| | | | +- EQUALS_MATCH: L['self'].num_random_blocks == 3
| | | +- GuardManager: source=L['self'].attention_head_size, accessed_by=DictGetItemGuardAccessor(attention_head_size)
| | | | +- EQUALS_MATCH: L['self'].attention_head_size == 64
| | | +- GuardManager: source=L['self'].num_attention_heads, accessed_by=DictGetItemGuardAccessor(num_attention_heads)
| | | | +- EQUALS_MATCH: L['self'].num_attention_heads == 12
| +- GuardManager: source=L['to_mask'], accessed_by=DictGetItemGuardAccessor(to_mask)
| | +- TENSOR_MATCH: check_tensor(L['to_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 1, 832], stride=[832, 832, 832, 1])
| | +- NO_HASATTR: hasattr(L['to_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['band_mask'], accessed_by=DictGetItemGuardAccessor(band_mask)
| | +- TENSOR_MATCH: check_tensor(L['band_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 9, 64, 192], stride=[110592, 110592, 12288, 192, 1])
| | +- NO_HASATTR: hasattr(L['band_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['from_mask'], accessed_by=DictGetItemGuardAccessor(from_mask)
| | +- TENSOR_MATCH: check_tensor(L['from_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 832, 1], stride=[832, 832, 1, 1])
| | +- NO_HASATTR: hasattr(L['from_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['hidden_states'], accessed_by=DictGetItemGuardAccessor(hidden_states)
| | +- TENSOR_MATCH: check_tensor(L['hidden_states'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 832, 768], stride=[638976, 768, 1])
| | +- NO_HASATTR: hasattr(L['hidden_states'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['from_blocked_mask'], accessed_by=DictGetItemGuardAccessor(from_blocked_mask)
| | +- TENSOR_MATCH: check_tensor(L['from_blocked_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 13, 64], stride=[832, 64, 1])
| | +- NO_HASATTR: hasattr(L['from_blocked_mask'], '_dynamo_dynamic_indices') == False
| | +- TENSOR_ALIASING: L['from_blocked_mask'] is L['to_blocked_mask']
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['output_attentions'], accessed_by=DictGetItemGuardAccessor(output_attentions)
| | +- ID_MATCH: ___check_obj_id(L['output_attentions'], 7685824)
| +- GuardManager: source=L['to_blocked_mask'], accessed_by=DictGetItemGuardAccessor(to_blocked_mask)
| | +- TENSOR_ALIASING: L['from_blocked_mask'] is L['to_blocked_mask']
V0627 17:31:03.923000 139845268738432 torch/_dynamo/] {"compilation_metrics": {"compile_id": "9/0", "frame_key": "14", "co_name": "forward", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/", "co_firstlineno": 446, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 21, "shape_env_guard_count": 0, "graph_op_count": 9, "graph_node_count": 11, "graph_input_count": 1, "start_time": 1719534663.7326608, "entire_frame_compile_time_s": 0.19047832489013672, "backend_compile_time_s": 0.14537477493286133, "inductor_compile_time_s": 0.0376286506652832, "code_gen_time_s": 0.016646862030029297, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Graph break due to unsupported builtin numpy.random.mtrand.seed. This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind). If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround. If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see for more details) or, if it is traceable, use torch.compiler.allow_in_graph."], "dynamo_time_before_restart_s": 0.018494129180908203, "has_guarded_code": true}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.925000 139845268738432 torch/_dynamo/] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 472, "name": "forward", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 10, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.985000 139845268738432 torch/_dynamo/] {"dynamo_guards": {}, "frame_id": 10, "frame_compile_id": 0, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
V0627 17:31:03.986000 139845268738432 torch/_dynamo/] {"dynamo_cpp_guards_str": {}, "frame_id": 10, "frame_compile_id": 0, "attempt": 1, "has_payload": "f6f5661bfad0dc293ecc9ef35ede39a0"}
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/ in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['seed'], accessed_by=DictGetItemGuardAccessor(seed)
| | +- EQUALS_MATCH: L['seed'] == 0
| +- GuardManager: source=L['batch_size'], accessed_by=DictGetItemGuardAccessor(batch_size)
| | +- EQUALS_MATCH: L['batch_size'] == 1
| +- GuardManager: source=L['to_seq_len'], accessed_by=DictGetItemGuardAccessor(to_seq_len)
| | +- EQUALS_MATCH: L['to_seq_len'] == 832
| +- GuardManager: source=L['from_seq_len'], accessed_by=DictGetItemGuardAccessor(from_seq_len)
| | +- EQUALS_MATCH: L['from_seq_len'] == 832
| +- GuardManager: source=L['to_block_size'], accessed_by=DictGetItemGuardAccessor(to_block_size)
| | +- EQUALS_MATCH: L['to_block_size'] == 64
| +- GuardManager: source=L['from_block_size'], accessed_by=DictGetItemGuardAccessor(from_block_size)
| | +- EQUALS_MATCH: L['from_block_size'] == 64
| +- GuardManager: source=L['attention_head_size'], accessed_by=DictGetItemGuardAccessor(attention_head_size)
| | +- EQUALS_MATCH: L['attention_head_size'] == 64
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['np'], accessed_by=DictGetItemGuardAccessor(np)
| | | +- ID_MATCH: ___check_obj_id(G['np'], 139845228893488)
| | | +- GuardManager: source=G['np'].random, accessed_by=GetAttrGuardAccessor(random)
| | | | +- ID_MATCH: ___check_obj_id(G['np'].random, 139842452860464)
| | | | +- GuardManager: source=G['np'].random.seed, accessed_by=GetAttrGuardAccessor(seed)
| | | | | +- ID_MATCH: ___check_obj_id(G['np'].random.seed, 139842451129264)
| | +- GuardManager: source=G['math'], accessed_by=DictGetItemGuardAccessor(math)
| | | +- ID_MATCH: ___check_obj_id(G['math'], 139845236089744)
| | | +- GuardManager: source=G['math'].sqrt, accessed_by=GetAttrGuardAccessor(sqrt)
| | | | +- ID_MATCH: ___check_obj_id(G['math'].sqrt, 139845236093344)
V0627 17:31:03.986000 139845268738432 torch/_dynamo/] {"compilation_metrics": {"compile_id": "10/0", "frame_key": "15", "co_name": "bigbird_block_sparse_attention", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/", "co_firstlineno": 516, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 17, "shape_env_guard_count": 0, "graph_op_count": 0, "graph_node_count": 0, "graph_input_count": 0, "start_time": 1719534663.9255476, "entire_frame_compile_time_s": 0.06077218055725098, "backend_compile_time_s": null, "inductor_compile_time_s": null, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Graph break due to unsupported builtin numpy.random.mtrand.seed. This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind). If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround. If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see for more details) or, if it is traceable, use torch.compiler.allow_in_graph."], "dynamo_time_before_restart_s": 0.008862972259521484, "has_guarded_code": true}, "frame_id": 10, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.987000 139845268738432 torch/_dynamo/] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 472, "name": "forward", "filename": 5}, {"line": 569, "name": "bigbird_block_sparse_attention", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 11, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.076000 139845268738432 torch/_dynamo/] {"dynamo_guards": {}, "frame_id": 11, "frame_compile_id": 0, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
V0627 17:31:04.077000 139845268738432 torch/_dynamo/] {"dynamo_cpp_guards_str": {}, "frame_id": 11, "frame_compile_id": 0, "attempt": 1, "has_payload": "f3df28d4d21dab674ac56179543067e7"}
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/ in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202274384)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| +- GuardManager: source=L['n_heads'], accessed_by=DictGetItemGuardAccessor(n_heads)
| | +- EQUALS_MATCH: L['n_heads'] == 12
| +- GuardManager: source=L['to_seq_len'], accessed_by=DictGetItemGuardAccessor(to_seq_len)
| | +- EQUALS_MATCH: L['to_seq_len'] == 832
| +- GuardManager: source=L['from_seq_len'], accessed_by=DictGetItemGuardAccessor(from_seq_len)
| | +- EQUALS_MATCH: L['from_seq_len'] == 832
| +- GuardManager: source=L['n_rand_blocks'], accessed_by=DictGetItemGuardAccessor(n_rand_blocks)
| | +- EQUALS_MATCH: L['n_rand_blocks'] == 3
| +- GuardManager: source=L['to_block_size'], accessed_by=DictGetItemGuardAccessor(to_block_size)
| | +- EQUALS_MATCH: L['to_block_size'] == 64
| +- GuardManager: source=L['from_block_size'], accessed_by=DictGetItemGuardAccessor(from_block_size)
| | +- EQUALS_MATCH: L['from_block_size'] == 64
| +- GuardManager: source=L['plan_from_length'], accessed_by=DictGetItemGuardAccessor(plan_from_length)
| | +- ID_MATCH: ___check_obj_id(L['plan_from_length'], 7636800)
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['__builtins_dict___37'], accessed_by=DictGetItemGuardAccessor(__builtins_dict___37)
| | | +- GuardManager: source=G['__builtins_dict___37']['int'], accessed_by=DictGetItemGuardAccessor(int)
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___37']['int'], 7648640)
V0627 17:31:04.077000 139845268738432 torch/_dynamo/] {"compilation_metrics": {"compile_id": "11/0", "frame_key": "16", "co_name": "torch_dynamo_resume_in_bigbird_block_sparse_attention_at_569", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/", "co_firstlineno": 569, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 14, "shape_env_guard_count": 0, "graph_op_count": 0, "graph_node_count": 0, "graph_input_count": 0, "start_time": 1719534663.987453, "entire_frame_compile_time_s": 0.08972334861755371, "backend_compile_time_s": null, "inductor_compile_time_s": null, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["data dependent operator: aten._local_scalar_dense.default; to enable, set torch._dynamo.config.capture_scalar_outputs = True"], "dynamo_time_before_restart_s": 0.02129364013671875, "has_guarded_code": true}, "frame_id": 11, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:04.078000 139845268738432 torch/_dynamo/] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 472, "name": "forward", "filename": 5}, {"line": 569, "name": "bigbird_block_sparse_attention", "filename": 5}, {"line": 583, "name": "torch_dynamo_resume_in_bigbird_block_sparse_attention_at_569", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 12, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.094000 139845268738432 torch/_dynamo/] {"dynamo_output_graph": {"sizes": {"wrapped_array": [2], "plan_block_length": [2]}}, "frame_id": 12, "frame_compile_id": 0, "attempt": 1, "has_payload": "9f82d9593d608d32ba61e6298aeb3649"}
class GraphModule(torch.nn.Module):
def forward(self):
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in _bigbird_block_rand_mask_with_head, code: plan_block_length = np.array(plan_from_length) // from_block_size
wrapped_array: "i64[2][1]cpu" = torch__dynamo_utils_wrapped_array([704, 832])
plan_block_length: "i64[2][1]cpu" = torch__dynamo_utils_wrapped_floordiv(wrapped_array, 64); wrapped_array = None
return (plan_block_length,)
V0627 17:31:04.106000 139845268738432 torch/_functorch/_aot_autograd/] {"aot_forward_graph": {}, "frame_id": 12, "frame_compile_id": 0, "attempt": 1, "has_payload": "1861734e40a5f61860344b326195085c"}
class <lambda>(torch.nn.Module):
def forward(self):
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in _bigbird_block_rand_mask_with_head, code: plan_block_length = np.array(plan_from_length) // from_block_size
_tensor_constant0 = self._tensor_constant0
lift_fresh_copy: "i64[2][1]cpu" = torch.ops.aten.lift_fresh_copy.default(_tensor_constant0); _tensor_constant0 = None
clone: "i64[2][1]cpu" = torch.ops.aten.clone.default(lift_fresh_copy); lift_fresh_copy = None
div: "i64[2][1]cpu" = torch.ops.aten.div.Tensor_mode(clone, 64, rounding_mode = 'floor'); clone = None
return (div,)
V0627 17:31:04.122000 139845268738432 torch/_dynamo/] {"dynamo_guards": {}, "frame_id": 12, "frame_compile_id": 0, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
V0627 17:31:04.122000 139845268738432 torch/_dynamo/] {"dynamo_cpp_guards_str": {}, "frame_id": 12, "frame_compile_id": 0, "attempt": 1, "has_payload": "c1fc05dff62c1bc070ea06a12430d940"}
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/ in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['num_heads'], accessed_by=DictGetItemGuardAccessor(num_heads)
| | +- EQUALS_MATCH: L['num_heads'] == 12
| +- GuardManager: source=L['to_block_size'], accessed_by=DictGetItemGuardAccessor(to_block_size)
| | +- EQUALS_MATCH: L['to_block_size'] == 64
| +- GuardManager: source=L['to_seq_length'], accessed_by=DictGetItemGuardAccessor(to_seq_length)
| | +- EQUALS_MATCH: L['to_seq_length'] == 832
| +- GuardManager: source=L['from_block_size'], accessed_by=DictGetItemGuardAccessor(from_block_size)
| | +- EQUALS_MATCH: L['from_block_size'] == 64
| +- GuardManager: source=L['from_seq_length'], accessed_by=DictGetItemGuardAccessor(from_seq_length)
| | +- EQUALS_MATCH: L['from_seq_length'] == 832
| +- GuardManager: source=L['plan_from_length'], accessed_by=DictGetItemGuardAccessor(plan_from_length)
| | +- TYPE_MATCH: ___check_type_id(L['plan_from_length'], 7650400)
| | +- LENGTH_CHECK: len(L['plan_from_length']) == 2
| | +- GuardManager: source=L['plan_from_length'][0], accessed_by=ListGetItemGuardAccessor(0)
| | | +- EQUALS_MATCH: L['plan_from_length'][0] == 704
| | +- GuardManager: source=L['plan_from_length'][1], accessed_by=ListGetItemGuardAccessor(1)
| | | +- EQUALS_MATCH: L['plan_from_length'][1] == 832
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['np'], accessed_by=DictGetItemGuardAccessor(np)
| | | +- ID_MATCH: ___check_obj_id(G['np'], 139845228893488)
| | | +- GuardManager: source=G['np'].array, accessed_by=GetAttrGuardAccessor(array)
| | | | +- ID_MATCH: ___check_obj_id(G['np'].array, 139845228959664)
| | +- GuardManager: source=G['__builtins_dict___40'], accessed_by=DictGetItemGuardAccessor(__builtins_dict___40)
| | | +- GuardManager: source=G['__builtins_dict___40']['list'], accessed_by=DictGetItemGuardAccessor(list)
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___40']['list'], 7650400)
| | | +- GuardManager: source=G['__builtins_dict___40']['range'], accessed_by=DictGetItemGuardAccessor(range)
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___40']['range'], 7632448)
| | | +- GuardManager: source=G['__builtins_dict___40']['enumerate'], accessed_by=DictGetItemGuardAccessor(enumerate)
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___40']['enumerate'], 7513024)
| | +- GuardManager: source=G['__import_torch_dot__dynamo_dot_polyfill'], accessed_by=DictGetItemGuardAccessor(__import_torch_dot__dynamo_dot_polyfill)
| | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot__dynamo_dot_polyfill'], 139839728158336)
V0627 17:31:04.122000 139845268738432 torch/_dynamo/] {"compilation_metrics": {"compile_id": "12/0", "frame_key": "17", "co_name": "_bigbird_block_rand_mask_with_head", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/", "co_firstlineno": 1111, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 19, "shape_env_guard_count": 0, "graph_op_count": 2, "graph_node_count": 3, "graph_input_count": 0, "start_time": 1719534664.0783854, "entire_frame_compile_time_s": 0.04439258575439453, "backend_compile_time_s": 0.01859426498413086, "inductor_compile_time_s": 0.00021767616271972656, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["data dependent operator: aten._local_scalar_dense.default; to enable, set torch._dynamo.config.capture_scalar_outputs = True"], "dynamo_time_before_restart_s": 0.00835871696472168, "has_guarded_code": true}, "frame_id": 12, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:04.123000 139845268738432 torch/_dynamo/] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 472, "name": "forward", "filename": 5}, {"line": 569, "name": "bigbird_block_sparse_attention", "filename": 5}, {"line": 583, "name": "torch_dynamo_resume_in_bigbird_block_sparse_attention_at_569", "filename": 5}, {"line": 1165, "name": "_bigbird_block_rand_mask_with_head", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 13, 
"frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.127000 139845268738432 torch/_dynamo/] {"compilation_metrics": {"compile_id": "13/0", "frame_key": "18", "co_name": "<listcomp>", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/", "co_firstlineno": 1165, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": null, "shape_env_guard_count": null, "graph_op_count": null, "graph_node_count": null, "graph_input_count": null, "start_time": 1719534664.1237168, "entire_frame_compile_time_s": null, "backend_compile_time_s": null, "inductor_compile_time_s": null, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": [], "dynamo_time_before_restart_s": 0.0032224655151367188, "has_guarded_code": false}, "frame_id": 13, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.127000 139845268738432 torch/_dynamo/] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 472, "name": "forward", "filename": 5}, {"line": 569, "name": "bigbird_block_sparse_attention", "filename": 5}, {"line": 583, "name": "torch_dynamo_resume_in_bigbird_block_sparse_attention_at_569", "filename": 5}, {"line": 1165, "name": "_bigbird_block_rand_mask_with_head", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 14, 
"frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.131000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 0, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.131000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311dafc0>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.131000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 36, "id": 0, "source": "___from_numpy(L['___stack0'][0])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.133000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 1, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.133000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311c3510>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.133000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 36, "id": 3, "source": "___from_numpy(L['___stack0'][1])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.135000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 2, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.135000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 4, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30db3dd0>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.135000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 36, "id": 4, "source": "___from_numpy(L['___stack0'][2])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.136000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 3, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.136000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 5, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 3, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30db1fd0>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.136000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 36, "id": 5, "source": "___from_numpy(L['___stack0'][3])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.138000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 4, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.138000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 6, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 4, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311daf70>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.138000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 36, "id": 6, "source": "___from_numpy(L['___stack0'][4])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.139000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 5, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.139000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 7, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 5, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311b8ea0>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.140000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 36, "id": 7, "source": "___from_numpy(L['___stack0'][5])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.141000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 6, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.141000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 8, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 6, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311ba200>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.141000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 36, "id": 8, "source": "___from_numpy(L['___stack0'][6])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.142000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 7, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.142000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 9, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 7, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e3119dd50>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.142000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 36, "id": 9, "source": "___from_numpy(L['___stack0'][7])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.144000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 8, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.144000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 10, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 8, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e3119c6d0>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.144000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 36, "id": 10, "source": "___from_numpy(L['___stack0'][8])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.145000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 9, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.146000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 11, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 9, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e3119c4a0>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.146000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 36, "id": 11, "source": "___from_numpy(L['___stack0'][9])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.147000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 10, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.147000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 12, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 10, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30d4d580>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.147000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 36, "id": 12, "source": "___from_numpy(L['___stack0'][10])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.148000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 11, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.149000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 13, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 11, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30d4df30>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.149000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 36, "id": 13, "source": "___from_numpy(L['___stack0'][11])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.153000 139845268738432 torch/_dynamo/] {"dynamo_output_graph": {"sizes": {"l_stack0_0_": [13, 3], "l_stack0_1_": [13, 3], "l_stack0_2_": [13, 3], "l_stack0_3_": [13, 3], "l_stack0_4_": [13, 3], "l_stack0_5_": [13, 3], "l_stack0_6_": [13, 3], "l_stack0_7_": [13, 3], "l_stack0_8_": [13, 3], "l_stack0_9_": [13, 3], "l_stack0_10_": [13, 3], "l_stack0_11_": [13, 3], "wrapped_getitem": [11, 3], "wrapped_getitem_1": [11, 3], "wrapped_getitem_2": [11, 3], "wrapped_getitem_3": [11, 3], "wrapped_getitem_4": [11, 3], "wrapped_getitem_5": [11, 3], "wrapped_getitem_6": [11, 3], "wrapped_getitem_7": [11, 3], "wrapped_getitem_8": [11, 3], "wrapped_getitem_9": [11, 3], "wrapped_getitem_10": [11, 3], "wrapped_getitem_11": [11, 3]}}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0, "has_payload": "17b857365a2979b2936469716bb70fcd"}
class GraphModule(torch.nn.Module):
def forward(self, L_stack0_0_: "i32[13, 3][3, 1]cpu", L_stack0_1_: "i32[13, 3][3, 1]cpu", L_stack0_2_: "i32[13, 3][3, 1]cpu", L_stack0_3_: "i32[13, 3][3, 1]cpu", L_stack0_4_: "i32[13, 3][3, 1]cpu", L_stack0_5_: "i32[13, 3][3, 1]cpu", L_stack0_6_: "i32[13, 3][3, 1]cpu", L_stack0_7_: "i32[13, 3][3, 1]cpu", L_stack0_8_: "i32[13, 3][3, 1]cpu", L_stack0_9_: "i32[13, 3][3, 1]cpu", L_stack0_10_: "i32[13, 3][3, 1]cpu", L_stack0_11_: "i32[13, 3][3, 1]cpu"):
l_stack0_0_ = L_stack0_0_
l_stack0_1_ = L_stack0_1_
l_stack0_2_ = L_stack0_2_
l_stack0_3_ = L_stack0_3_
l_stack0_4_ = L_stack0_4_
l_stack0_5_ = L_stack0_5_
l_stack0_6_ = L_stack0_6_
l_stack0_7_ = L_stack0_7_
l_stack0_8_ = L_stack0_8_
l_stack0_9_ = L_stack0_9_
l_stack0_10_ = L_stack0_10_
l_stack0_11_ = L_stack0_11_
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in__bigbird_block_rand_mask_with_head_at_1165, code: rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :]
wrapped_getitem: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem(l_stack0_0_, (slice(1, 12, None), slice(None, None, None))); l_stack0_0_ = None
wrapped_getitem_1: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_1(l_stack0_1_, (slice(1, 12, None), slice(None, None, None))); l_stack0_1_ = None
wrapped_getitem_2: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_2(l_stack0_2_, (slice(1, 12, None), slice(None, None, None))); l_stack0_2_ = None
wrapped_getitem_3: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_3(l_stack0_3_, (slice(1, 12, None), slice(None, None, None))); l_stack0_3_ = None
wrapped_getitem_4: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_4(l_stack0_4_, (slice(1, 12, None), slice(None, None, None))); l_stack0_4_ = None
wrapped_getitem_5: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_5(l_stack0_5_, (slice(1, 12, None), slice(None, None, None))); l_stack0_5_ = None
wrapped_getitem_6: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_6(l_stack0_6_, (slice(1, 12, None), slice(None, None, None))); l_stack0_6_ = None
wrapped_getitem_7: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_7(l_stack0_7_, (slice(1, 12, None), slice(None, None, None))); l_stack0_7_ = None
wrapped_getitem_8: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_8(l_stack0_8_, (slice(1, 12, None), slice(None, None, None))); l_stack0_8_ = None
wrapped_getitem_9: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_9(l_stack0_9_, (slice(1, 12, None), slice(None, None, None))); l_stack0_9_ = None
wrapped_getitem_10: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_10(l_stack0_10_, (slice(1, 12, None), slice(None, None, None))); l_stack0_10_ = None
wrapped_getitem_11: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_11(l_stack0_11_, (slice(1, 12, None), slice(None, None, None))); l_stack0_11_ = None
return (wrapped_getitem, wrapped_getitem_1, wrapped_getitem_2, wrapped_getitem_3, wrapped_getitem_4, wrapped_getitem_5, wrapped_getitem_6, wrapped_getitem_7, wrapped_getitem_8, wrapped_getitem_9, wrapped_getitem_10, wrapped_getitem_11)
V0627 17:31:04.205000 139845268738432 torch/_functorch/_aot_autograd/] {"aot_forward_graph": {}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0, "has_payload": "949e5e36955b193a43ad76da2f5a5f52"}
class <lambda>(torch.nn.Module):
def forward(self, arg0_1: "i32[13, 3][3, 1]cpu", arg1_1: "i32[13, 3][3, 1]cpu", arg2_1: "i32[13, 3][3, 1]cpu", arg3_1: "i32[13, 3][3, 1]cpu", arg4_1: "i32[13, 3][3, 1]cpu", arg5_1: "i32[13, 3][3, 1]cpu", arg6_1: "i32[13, 3][3, 1]cpu", arg7_1: "i32[13, 3][3, 1]cpu", arg8_1: "i32[13, 3][3, 1]cpu", arg9_1: "i32[13, 3][3, 1]cpu", arg10_1: "i32[13, 3][3, 1]cpu", arg11_1: "i32[13, 3][3, 1]cpu"):
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in__bigbird_block_rand_mask_with_head_at_1165, code: rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :]
slice_1: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg0_1, 0, 1, 12); arg0_1 = None
slice_2: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_1, 1, 0, 9223372036854775807); slice_1 = None
slice_3: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg1_1, 0, 1, 12); arg1_1 = None
slice_4: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_3, 1, 0, 9223372036854775807); slice_3 = None
slice_5: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg2_1, 0, 1, 12); arg2_1 = None
slice_6: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_5, 1, 0, 9223372036854775807); slice_5 = None
slice_7: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg3_1, 0, 1, 12); arg3_1 = None
slice_8: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_7, 1, 0, 9223372036854775807); slice_7 = None
slice_9: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg4_1, 0, 1, 12); arg4_1 = None
slice_10: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_9, 1, 0, 9223372036854775807); slice_9 = None
slice_11: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg5_1, 0, 1, 12); arg5_1 = None
slice_12: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_11, 1, 0, 9223372036854775807); slice_11 = None
slice_13: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg6_1, 0, 1, 12); arg6_1 = None
slice_14: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_13, 1, 0, 9223372036854775807); slice_13 = None
slice_15: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg7_1, 0, 1, 12); arg7_1 = None
slice_16: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_15, 1, 0, 9223372036854775807); slice_15 = None
slice_17: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg8_1, 0, 1, 12); arg8_1 = None
slice_18: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_17, 1, 0, 9223372036854775807); slice_17 = None
slice_19: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg9_1, 0, 1, 12); arg9_1 = None
slice_20: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_19, 1, 0, 9223372036854775807); slice_19 = None
slice_21: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg10_1, 0, 1, 12); arg10_1 = None
slice_22: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_21, 1, 0, 9223372036854775807); slice_21 = None
slice_23: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg11_1, 0, 1, 12); arg11_1 = None
slice_24: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_23, 1, 0, 9223372036854775807); slice_23 = None
return (slice_2, slice_4, slice_6, slice_8, slice_10, slice_12, slice_14, slice_16, slice_18, slice_20, slice_22, slice_24)
V0627 17:31:04.236000 139845268738432 torch/_inductor/] {"inductor_post_grad_graph": {}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0, "has_payload": "9c01c298863b324227937cc74dd4d962"}
class <lambda>(torch.nn.Module):
def forward(self, arg0_1: "i32[13, 3][3, 1]cpu", arg1_1: "i32[13, 3][3, 1]cpu", arg2_1: "i32[13, 3][3, 1]cpu", arg3_1: "i32[13, 3][3, 1]cpu", arg4_1: "i32[13, 3][3, 1]cpu", arg5_1: "i32[13, 3][3, 1]cpu", arg6_1: "i32[13, 3][3, 1]cpu", arg7_1: "i32[13, 3][3, 1]cpu", arg8_1: "i32[13, 3][3, 1]cpu", arg9_1: "i32[13, 3][3, 1]cpu", arg10_1: "i32[13, 3][3, 1]cpu", arg11_1: "i32[13, 3][3, 1]cpu"):
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in__bigbird_block_rand_mask_with_head_at_1165, code: rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :]
slice_1: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg0_1, 0, 1, 12); arg0_1 = None
slice_3: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg1_1, 0, 1, 12); arg1_1 = None
slice_5: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg2_1, 0, 1, 12); arg2_1 = None
slice_7: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg3_1, 0, 1, 12); arg3_1 = None
slice_9: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg4_1, 0, 1, 12); arg4_1 = None
slice_11: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg5_1, 0, 1, 12); arg5_1 = None
slice_13: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg6_1, 0, 1, 12); arg6_1 = None
slice_15: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg7_1, 0, 1, 12); arg7_1 = None
slice_17: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg8_1, 0, 1, 12); arg8_1 = None
slice_19: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg9_1, 0, 1, 12); arg9_1 = None
slice_21: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg10_1, 0, 1, 12); arg10_1 = None
slice_23: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg11_1, 0, 1, 12); arg11_1 = None
return (slice_1, slice_3, slice_5, slice_7, slice_9, slice_11, slice_13, slice_15, slice_17, slice_19, slice_21, slice_23)
V0627 17:31:04.250000 139845268738432 torch/_inductor/] {"inductor_output_code": {"filename": "/tmp/torchinductor_leslie/ki/"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0, "has_payload": "6641116284eedbc64e23effbbbfe40e6"}
# AOT ID: ['7_inference']
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
from torch._inductor.codegen.memory_planning import _align as align
from torch import device, empty_strided
from torch._inductor.async_compile import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
from torch._inductor.codegen.multi_kernel import MultiKernelCall
# Module-level shorthands for the torch op namespaces used by the wrapper code.
aten = torch.ops.aten
inductor_ops = torch.ops.inductor
_quantized = torch.ops._quantized
# Guard/allocation helpers bound once so the wrapper can call them by short name.
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
alloc_from_pool = torch.ops.inductor._alloc_from_pool
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
# The AsyncCompile handle is created and deleted immediately; nothing in this
# module submits any work to it in between.
async_compile = AsyncCompile()
del async_compile
def call(args):
    """Validate the twelve graph inputs and return their row-trimmed results.

    ``args`` must unpack into exactly twelve tensors. Each is asserted to have
    size (13, 3) and stride (3, 1). Every input is then passed to
    ``reinterpret_tensor`` with size (11, 3), stride (3, 1) and an offset of 3
    elements, which with a row stride of 3 addresses rows 1 through 11 of the
    input.

    Returns:
        A 12-tuple of the reinterpreted tensors, in the same order as ``args``.
    """
    (in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11) = args
    inputs = (in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11)

    # Check every input before producing any output, as separate passes.
    for tensor in inputs:
        assert_size_stride(tensor, (13, 3), (3, 1))

    # NOTE(review): presumably each result aliases its input's storage rather
    # than copying - confirm against torch.ops.inductor._reinterpret_tensor.
    return tuple(
        reinterpret_tensor(tensor, (11, 3), (3, 1), 3) for tensor in inputs
    )
def benchmark_compiled_module(times=10, repeat=10):
    """Benchmark ``call`` on twelve generated int32 CPU inputs.

    Twelve tensors of size (13, 3) and stride (3, 1) are created with
    ``rand_strided``; ``print_performance`` is then asked to time a closure
    that feeds them to ``call``.

    Args:
        times: Forwarded to ``print_performance`` as ``times``.
        repeat: Forwarded to ``print_performance`` as ``repeat``.

    Returns:
        Whatever ``print_performance`` returns.
    """
    from torch._dynamo.testing import rand_strided
    from torch._inductor.utils import print_performance

    inputs = [
        rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
        for _ in range(12)
    ]

    def fn():
        # Hand ``call`` a fresh list on every invocation.
        return call(list(inputs))

    return print_performance(fn, times=times, repeat=repeat)
# When run as a script, hand the benchmark function to compiled_module_main
# under the benchmark name 'hf_BigBird'.
if __name__ == "__main__":
from torch._inductor.wrapper_benchmark import compiled_module_main
compiled_module_main('hf_BigBird', benchmark_compiled_module)
V0627 17:31:04.258000 139845268738432 torch/_dynamo/] {"dynamo_guards": {}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
V0627 17:31:04.259000 139845268738432 torch/_dynamo/] {"dynamo_cpp_guards_str": {}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0, "has_payload": "bb9e1aaf4decc7f300fbb51ff6f34967"}
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/ in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202274384)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| +- GuardManager: source=L['___stack0'], accessed_by=DictGetItemGuardAccessor(___stack0)
| | +- TYPE_MATCH: ___check_type_id(L['___stack0'], 7650400)
| | +- LENGTH_CHECK: len(L['___stack0']) == 12
| | +- GuardManager: source=L['___stack0'][0], accessed_by=ListGetItemGuardAccessor(0)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][0]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][0]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][0]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][1], accessed_by=ListGetItemGuardAccessor(1)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][1]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][1]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][1]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][2], accessed_by=ListGetItemGuardAccessor(2)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][2]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][2]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][2]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][3], accessed_by=ListGetItemGuardAccessor(3)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][3]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][3]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][3]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][4], accessed_by=ListGetItemGuardAccessor(4)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][4]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][4]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][4]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][5], accessed_by=ListGetItemGuardAccessor(5)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][5]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][5]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][5]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][6], accessed_by=ListGetItemGuardAccessor(6)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][6]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][6]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][6]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][7], accessed_by=ListGetItemGuardAccessor(7)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][7]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][7]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][7]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][8], accessed_by=ListGetItemGuardAccessor(8)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][8]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][8]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][8]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][9], accessed_by=ListGetItemGuardAccessor(9)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][9]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][9]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][9]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][10], accessed_by=ListGetItemGuardAccessor(10)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][10]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][10]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][10]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][11], accessed_by=ListGetItemGuardAccessor(11)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][11]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][11]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][11]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| +- GuardManager: source=L['num_heads'], accessed_by=DictGetItemGuardAccessor(num_heads)
| | +- EQUALS_MATCH: L['num_heads'] == 12
| +- GuardManager: source=L['num_blocks'], accessed_by=DictGetItemGuardAccessor(num_blocks)
| | +- EQUALS_MATCH: L['num_blocks'] == 13
| +- GuardManager: source=L['global_block_top'], accessed_by=DictGetItemGuardAccessor(global_block_top)
| | +- EQUALS_MATCH: L['global_block_top'] == 1
| +- GuardManager: source=L['global_block_bottom'], accessed_by=DictGetItemGuardAccessor(global_block_bottom)
| | +- EQUALS_MATCH: L['global_block_bottom'] == 1
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['__builtins_dict___44'], accessed_by=DictGetItemGuardAccessor(__builtins_dict___44)
| | | +- GuardManager: source=G['__builtins_dict___44']['range'], accessed_by=DictGetItemGuardAccessor(range)
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___44']['range'], 7632448)
V0627 17:31:04.259000 139845268738432 torch/_dynamo/] {"compilation_metrics": {"compile_id": "14/0", "frame_key": "19", "co_name": "torch_dynamo_resume_in__bigbird_block_rand_mask_with_head_at_1165", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/", "co_firstlineno": 1165, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 25, "shape_env_guard_count": 0, "graph_op_count": 12, "graph_node_count": 25, "graph_input_count": 12, "start_time": 1719534664.1278515, "entire_frame_compile_time_s": 0.13145732879638672, "backend_compile_time_s": 0.09916210174560547, "inductor_compile_time_s": 0.022524356842041016, "code_gen_time_s": 0.003596067428588867, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": [], "dynamo_time_before_restart_s": 0.0, "has_guarded_code": true}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.260000 139845268738432 torch/_dynamo/] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 472, "name": "forward", "filename": 5}, {"line": 569, "name": "bigbird_block_sparse_attention", "filename": 5}, {"line": 583, "name": "torch_dynamo_resume_in_bigbird_block_sparse_attention_at_569", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.267000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 0, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.267000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e310b7060>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.267000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 38, "id": 0, "source": "___from_numpy(L['___stack0'][0])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.268000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 1, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.268000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311ad1c0>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.268000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 38, "id": 1, "source": "___from_numpy(L['___stack0'][1])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.269000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 2, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.269000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 2, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e3119c810>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.269000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 38, "id": 2, "source": "___from_numpy(L['___stack0'][2])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.269000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 3, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.270000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 3, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e3119ef70>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.270000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 38, "id": 3, "source": "___from_numpy(L['___stack0'][3])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.270000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 4, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.270000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 4, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 4, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311c58a0>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.270000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 38, "id": 4, "source": "___from_numpy(L['___stack0'][4])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.271000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 5, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.271000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 5, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 5, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30db1d00>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.271000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 38, "id": 5, "source": "___from_numpy(L['___stack0'][5])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.272000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 6, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.272000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 6, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 6, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e310b9f30>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.272000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 38, "id": 6, "source": "___from_numpy(L['___stack0'][6])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.273000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 7, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.273000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 7, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 7, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311ac310>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.273000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 38, "id": 7, "source": "___from_numpy(L['___stack0'][7])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.274000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 8, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.274000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 8, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 8, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e310bb100>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.274000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 38, "id": 8, "source": "___from_numpy(L['___stack0'][8])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.274000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 9, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.275000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 9, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 9, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30d4d580>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.275000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 38, "id": 9, "source": "___from_numpy(L['___stack0'][9])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.275000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 10, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.275000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 10, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 10, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30d4c450>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.275000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 38, "id": 10, "source": "___from_numpy(L['___stack0'][10])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.276000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 11, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.276000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 11, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 11, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e310f5e90>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.276000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 38, "id": 11, "source": "___from_numpy(L['___stack0'][11])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.279000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 12, "describer_id": 38, "size": 1277952}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.279000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 13, "ndim": 4, "dtype": "torch.bfloat16", "device": "device(type='cpu')", "size": [1, 12, 832, 64], "is_leaf": true, "stride": [638976, 64, 768, 1], "storage": 12, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311c6840>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.279000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 38, "id": 13, "source": "L['query_layer']"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.283000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 13, "describer_id": 38, "size": 3328}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.283000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 17, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 13, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.284000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 16, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 13, 64], "is_leaf": true, "is_view": true, "stride": [832, 64, 1], "storage": 13, "base": 17, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317fbec0>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.284000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 38, "id": 16, "source": "L['from_blocked_mask']"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.293000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 14, "describer_id": 38, "size": 1277952}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.294000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 31, "ndim": 4, "dtype": "torch.bfloat16", "device": "device(type='cpu')", "size": [1, 12, 832, 64], "is_leaf": true, "stride": [638976, 64, 768, 1], "storage": 14, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311c73d0>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.294000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 38, "id": 31, "source": "L['key_layer']"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.295000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 15, "describer_id": 38, "size": 1277952}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.295000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 32, "ndim": 4, "dtype": "torch.bfloat16", "device": "device(type='cpu')", "size": [1, 12, 832, 64], "is_leaf": true, "stride": [638976, 64, 768, 1], "storage": 15, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311c7470>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.295000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 38, "id": 32, "source": "L['value_layer']"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.316000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 50, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 13, "base": 17, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.317000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 38, "id": 50, "source": "L['to_mask']"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.378000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 16, "describer_id": 38, "size": 442368}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.378000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 124, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 16, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.378000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 38, "id": 124, "source": "L['band_mask']"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.439000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 182, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 13, "base": 17, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.439000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 38, "id": 182, "source": "L['from_mask']"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.456000 139845268738432 torch/_dynamo/] {"dynamo_output_graph": {"sizes": {"l_stack0_0_": [11, 3], "l_stack0_1_": [11, 3], "l_stack0_2_": [11, 3], "l_stack0_3_": [11, 3], "l_stack0_4_": [11, 3], "l_stack0_5_": [11, 3], "l_stack0_6_": [11, 3], "l_stack0_7_": [11, 3], "l_stack0_8_": [11, 3], "l_stack0_9_": [11, 3], "l_stack0_10_": [11, 3], "l_stack0_11_": [11, 3], "l_query_layer_": [1, 12, 832, 64], "l_from_blocked_mask_": [1, 13, 64], "l_key_layer_": [1, 12, 832, 64], "l_value_layer_": [1, 12, 832, 64], "l_to_mask_": [1, 1, 1, 832], "l_band_mask_": [1, 1, 9, 64, 192], "l_from_mask_": [1, 1, 832, 1], "rand_attn": [12, 11, 3], "rand_attn_1": [1, 12, 11, 3], "unsqueeze_": [1, 12, 11, 3], "rand_attn_2": [1, 12, 11, 3], "p1": [13, 64], "i1": [12, 11, 3], "flatten": [396], "getitem_2": [396, 64], "rand_mask": [1, 396, 64], "rand_mask_1": [1, 12, 11, 192], "getitem_3": [1, 11, 64], "rand_mask_2": [1, 12, 11, 64, 192], "blocked_query_matrix": [1, 12, 13, 64, 64], "blocked_key_matrix": [1, 12, 13, 64, 64], "blocked_value_matrix": [1, 12, 13, 64, 64], "shift": [396], "div": [396], "indices_shift": [396], "view_4": [396], "flattened_indices": [396], "flattened_params": [156, 64, 64], "out_flattened": [396, 64, 64], "out": [1, 12, 33, 64, 64], "gathered_key": [1, 12, 11, 192, 64], "shift_1": [396], "div_1": [396], "indices_shift_1": [396], "view_6": [396], "flattened_indices_1": [396], "flattened_params_1": [156, 64, 64], "out_flattened_1": [396, 64, 64], "out_1": [1, 12, 33, 64, 64], "gathered_value": [1, 12, 11, 192, 64], "getitem_4": [1, 12, 64, 64], "reshape_4": [12, 64, 64], "reshape_5": [12, 832, 64], "transpose": [12, 64, 832], "bmm": [12, 64, 832], "first_product": [1, 12, 64, 832], "first_product_1": [1, 12, 64, 832], "sub": [1, 1, 1, 832], "mul_3": [1, 1, 1, 832], "first_product_2": [1, 12, 64, 832], "first_attn_weights": [1, 12, 64, 832], "reshape_6": [12, 64, 832], "reshape_7": [12, 832, 64], "bmm_1": [12, 64, 64], "first_context_layer": [1, 12, 1, 64, 
64], "unsqueeze__1": [1, 12, 1, 64, 64], "getitem_5": [1, 12, 64, 64], "getitem_6": [1, 12, 64, 64], "getitem_7": [1, 12, 64, 64], "getitem_8": [1, 12, 64, 64], "getitem_9": [1, 12, 192, 64], "second_key_mat": [1, 12, 448, 64], "getitem_10": [1, 12, 64, 64], "getitem_11": [1, 12, 64, 64], "getitem_12": [1, 12, 64, 64], "getitem_13": [1, 12, 64, 64], "getitem_14": [1, 12, 192, 64], "second_value_mat": [1, 12, 448, 64], "getitem_15": [1, 12, 64, 64], "reshape_8": [12, 64, 64], "reshape_9": [12, 448, 64], "transpose_1": [12, 64, 448], "bmm_2": [12, 64, 448], "second_product": [1, 12, 64, 448], "getitem_16": [1, 1, 1, 192], "getitem_17": [1, 1, 1, 64], "new_ones": [1, 1, 1, 192], "second_seq_pad": [1, 1, 1, 448], "new_ones_1": [1, 12, 64, 256], "getitem_18": [1, 12, 64, 192], "second_rand_pad": [1, 12, 64, 448], "second_product_1": [1, 12, 64, 448], "minimum": [1, 12, 64, 448], "sub_1": [1, 12, 64, 448], "mul_5": [1, 12, 64, 448], "second_product_2": [1, 12, 64, 448], "second_attn_weights": [1, 12, 64, 448], "reshape_10": [12, 64, 448], "reshape_11": [12, 448, 64], "bmm_3": [12, 64, 64], "second_context_layer": [1, 12, 1, 64, 64], "unsqueeze__2": [1, 12, 1, 64, 64], "getitem_19": [1, 12, 9, 64, 64], "getitem_20": [1, 12, 9, 64, 64], "getitem_21": [1, 12, 9, 64, 64], "exp_blocked_key_matrix": [1, 12, 9, 192, 64], "getitem_22": [1, 12, 9, 64, 64], "getitem_23": [1, 12, 9, 64, 64], "getitem_24": [1, 12, 9, 64, 64], "exp_blocked_value_matrix": [1, 12, 9, 192, 64], "middle_query_matrix": [1, 12, 9, 64, 64], "reshape_12": [108, 64, 64], "reshape_13": [108, 192, 64], "transpose_2": [108, 64, 192], "bmm_4": [108, 64, 192], "inner_band_product": [1, 12, 9, 64, 192], "inner_band_product_1": [1, 12, 9, 64, 192], "getitem_26": [1, 12, 9, 192, 64], "reshape_14": [108, 64, 64], "reshape_15": [108, 192, 64], "transpose_3": [108, 64, 192], "bmm_5": [108, 64, 192], "rand_band_product": [1, 12, 9, 64, 192], "rand_band_product_1": [1, 12, 9, 64, 192], "getitem_27": [1, 12, 64, 64], 
"first_band_product": [1, 12, 9, 64, 64], "first_band_product_1": [1, 12, 9, 64, 64], "getitem_28": [1, 12, 64, 64], "last_band_product": [1, 12, 9, 64, 64], "last_band_product_1": [1, 12, 9, 64, 64], "sub_2": [1, 1, 9, 64, 192], "mul_10": [1, 1, 9, 64, 192], "inner_band_product_2": [1, 12, 9, 64, 192], "getitem_29": [1, 1, 1, 64], "unsqueeze": [1, 1, 1, 1, 64], "sub_3": [1, 1, 1, 1, 64], "mul_11": [1, 1, 1, 1, 64], "first_band_product_2": [1, 12, 9, 64, 64], "getitem_30": [1, 1, 1, 64], "unsqueeze_1": [1, 1, 1, 1, 64], "sub_4": [1, 1, 1, 1, 64], "mul_12": [1, 1, 1, 1, 64], "last_band_product_2": [1, 12, 9, 64, 64], "getitem_31": [1, 12, 9, 64, 192], "sub_5": [1, 12, 9, 64, 192], "mul_13": [1, 12, 9, 64, 192], "rand_band_product_2": [1, 12, 9, 64, 192], "band_product": [1, 12, 9, 64, 512], "attn_weights": [1, 12, 9, 64, 512], "getitem_32": [1, 12, 9, 64, 192], "reshape_16": [108, 64, 192], "reshape_17": [108, 192, 64], "bmm_6": [108, 64, 64], "context_layer": [1, 12, 9, 64, 64], "getitem_33": [1, 12, 9, 64, 192], "getitem_34": [1, 12, 9, 192, 64], "reshape_18": [108, 64, 192], "reshape_19": [108, 192, 64], "bmm_7": [108, 64, 64], "view_15": [1, 12, 9, 64, 64], "context_layer_1": [1, 12, 9, 64, 64], "getitem_35": [1, 12, 9, 64, 64], "getitem_36": [1, 12, 64, 64], "einsum_3": [1, 12, 9, 64, 64], "context_layer_2": [1, 12, 9, 64, 64], "getitem_37": [1, 12, 9, 64, 64], "getitem_38": [1, 12, 64, 64], "einsum_4": [1, 12, 9, 64, 64], "context_layer_3": [1, 12, 9, 64, 64], "getitem_39": [1, 12, 64, 64], "getitem_40": [1, 12, 64, 64], "getitem_41": [1, 12, 64, 64], "getitem_42": [1, 12, 64, 64], "getitem_43": [1, 12, 192, 64], "second_last_key_mat": [1, 12, 448, 64], "getitem_44": [1, 12, 64, 64], "getitem_45": [1, 12, 64, 64], "getitem_46": [1, 12, 64, 64], "getitem_47": [1, 12, 64, 64], "getitem_48": [1, 12, 192, 64], "second_last_value_mat": [1, 12, 448, 64], "getitem_49": [1, 12, 64, 64], "reshape_20": [12, 64, 64], "reshape_21": [12, 448, 64], "transpose_4": [12, 64, 
448], "bmm_8": [12, 64, 448], "second_last_product": [1, 12, 64, 448], "getitem_50": [1, 1, 1, 64], "getitem_51": [1, 1, 1, 192], "new_ones_2": [1, 1, 1, 192], "second_last_seq_pad": [1, 1, 1, 448], "new_ones_3": [1, 12, 64, 256], "getitem_52": [1, 12, 64, 192], "second_last_rand_pad": [1, 12, 64, 448], "second_last_product_1": [1, 12, 64, 448], "minimum_1": [1, 12, 64, 448], "sub_6": [1, 12, 64, 448], "mul_15": [1, 12, 64, 448], "second_last_product_2": [1, 12, 64, 448], "second_last_attn_weights": [1, 12, 64, 448], "reshape_22": [12, 64, 448], "reshape_23": [12, 448, 64], "bmm_9": [12, 64, 64], "second_last_context_layer": [1, 12, 1, 64, 64], "unsqueeze__3": [1, 12, 1, 64, 64], "getitem_53": [1, 12, 64, 64], "reshape_24": [12, 64, 64], "reshape_25": [12, 832, 64], "transpose_5": [12, 64, 832], "bmm_10": [12, 64, 832], "last_product": [1, 12, 64, 832], "last_product_1": [1, 12, 64, 832], "sub_7": [1, 1, 1, 832], "mul_17": [1, 1, 1, 832], "last_product_2": [1, 12, 64, 832], "last_attn_weights": [1, 12, 64, 832], "reshape_26": [12, 64, 832], "reshape_27": [12, 832, 64], "bmm_11": [12, 64, 64], "last_context_layer": [1, 12, 1, 64, 64], "unsqueeze__4": [1, 12, 1, 64, 64], "context_layer_4": [1, 12, 13, 64, 64], "view_20": [1, 12, 832, 64], "context_layer_5": [1, 12, 832, 64], "context_layer_6": [1, 832, 12, 64]}}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0, "has_payload": "8c75f78cbe780240c3647a51d604b94a"}
class GraphModule(torch.nn.Module):
def forward(self, L_stack0_0_: "i32[11, 3][3, 1]cpu", L_stack0_1_: "i32[11, 3][3, 1]cpu", L_stack0_2_: "i32[11, 3][3, 1]cpu", L_stack0_3_: "i32[11, 3][3, 1]cpu", L_stack0_4_: "i32[11, 3][3, 1]cpu", L_stack0_5_: "i32[11, 3][3, 1]cpu", L_stack0_6_: "i32[11, 3][3, 1]cpu", L_stack0_7_: "i32[11, 3][3, 1]cpu", L_stack0_8_: "i32[11, 3][3, 1]cpu", L_stack0_9_: "i32[11, 3][3, 1]cpu", L_stack0_10_: "i32[11, 3][3, 1]cpu", L_stack0_11_: "i32[11, 3][3, 1]cpu", L_query_layer_: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", L_from_blocked_mask_: "f32[1, 13, 64][832, 64, 1]cpu", L_key_layer_: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", L_value_layer_: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", L_to_mask_: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu", L_band_mask_: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu", L_from_mask_: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu"):
l_stack0_0_ = L_stack0_0_
l_stack0_1_ = L_stack0_1_
l_stack0_2_ = L_stack0_2_
l_stack0_3_ = L_stack0_3_
l_stack0_4_ = L_stack0_4_
l_stack0_5_ = L_stack0_5_
l_stack0_6_ = L_stack0_6_
l_stack0_7_ = L_stack0_7_
l_stack0_8_ = L_stack0_8_
l_stack0_9_ = L_stack0_9_
l_stack0_10_ = L_stack0_10_
l_stack0_11_ = L_stack0_11_
l_query_layer_ = L_query_layer_
l_from_blocked_mask_ = L_from_blocked_mask_
l_key_layer_ = L_key_layer_
l_value_layer_ = L_value_layer_
l_to_mask_ = L_to_mask_
l_band_mask_ = L_band_mask_
l_from_mask_ = L_from_mask_
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = np.stack(rand_attn, axis=0)
rand_attn: "i32[12, 11, 3][33, 3, 1]cpu" = torch__dynamo_utils_wrapped_stack([l_stack0_0_, l_stack0_1_, l_stack0_2_, l_stack0_3_, l_stack0_4_, l_stack0_5_, l_stack0_6_, l_stack0_7_, l_stack0_8_, l_stack0_9_, l_stack0_10_, l_stack0_11_], axis = 0); l_stack0_0_ = l_stack0_1_ = l_stack0_2_ = l_stack0_3_ = l_stack0_4_ = l_stack0_5_ = l_stack0_6_ = l_stack0_7_ = l_stack0_8_ = l_stack0_9_ = l_stack0_10_ = l_stack0_11_ = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = torch.tensor(rand_attn, device=query_layer.device, dtype=torch.long)
rand_attn_1: "i64[1, 12, 11, 3][396, 33, 3, 1]cpu" = torch.tensor(rand_attn, device = device(type='cpu'), dtype = torch.int64); rand_attn = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn.unsqueeze_(0)
unsqueeze_: "i64[1, 12, 11, 3][396, 33, 3, 1]cpu" = rand_attn_1.unsqueeze_(0)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn =[rand_attn for _ in range(batch_size)], dim=0)
rand_attn_2: "i64[1, 12, 11, 3][396, 33, 3, 1]cpu" =[rand_attn_1], dim = 0); rand_attn_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in _create_rand_mask_from_inputs, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
p1: "f32[13, 64][64, 1]cpu" = l_from_blocked_mask_[0]
i1: "i64[12, 11, 3][33, 3, 1]cpu" = rand_attn_2[0]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in <listcomp>, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
flatten: "i64[396][1]cpu" = i1.flatten(); i1 = None
getitem_2: "f32[396, 64][64, 1]cpu" = p1[flatten]; p1 = flatten = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in _create_rand_mask_from_inputs, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
rand_mask: "f32[1, 396, 64][25344, 64, 1]cpu" = torch.stack([getitem_2]); getitem_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in _create_rand_mask_from_inputs, code: rand_mask = rand_mask.view(batch_size, num_attention_heads, num_windows, num_rand_blocks * from_block_size)
rand_mask_1: "f32[1, 12, 11, 192][25344, 2112, 192, 1]cpu" = rand_mask.view(1, 12, 11, 192); rand_mask = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in _create_rand_mask_from_inputs, code: rand_mask = torch.einsum("blq,bhlk->bhlqk", from_blocked_mask[:, 1:-1], rand_mask)
getitem_3: "f32[1, 11, 64][832, 64, 1]cpu" = l_from_blocked_mask_[(slice(None, None, None), slice(1, -1, None))]; l_from_blocked_mask_ = None
rand_mask_2: "f32[1, 12, 11, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.functional.einsum('blq,bhlk->bhlqk', getitem_3, rand_mask_1); getitem_3 = rand_mask_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_query_matrix = query_layer.view(bsz, n_heads, from_seq_len // from_block_size, from_block_size, -1)
blocked_query_matrix: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = l_query_layer_.view(1, 12, 13, 64, -1); l_query_layer_ = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix = key_layer.view(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1)
blocked_key_matrix: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = l_key_layer_.view(1, 12, 13, 64, -1)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix = value_layer.view(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1)
blocked_value_matrix: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = l_value_layer_.view(1, 12, 13, 64, -1)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_gather_b2, code: shift = torch.arange(indices.shape[0] * indices.shape[1] * num_indices_to_gather, device=indices.device)
shift: "i64[396][1]cpu" = torch.arange(396, device = device(type='cpu'))
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_gather_b2, code: indices_shift = torch.div(shift, num_indices_to_gather, rounding_mode="floor") * num_indices_to_pick_from
div: "i64[396][1]cpu" = torch.div(shift, 33, rounding_mode = 'floor'); shift = None
indices_shift: "i64[396][1]cpu" = div * 13; div = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_gather_b2, code: flattened_indices = indices.view(-1) + indices_shift
view_4: "i64[396][1]cpu" = rand_attn_2.view(-1)
flattened_indices: "i64[396][1]cpu" = view_4 + indices_shift; view_4 = indices_shift = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_gather_b2, code: flattened_params = params.reshape(-1, params.shape[-2], params.shape[-1])
flattened_params: "bf16[156, 64, 64][4096, 64, 1]cpu" = blocked_key_matrix.reshape(-1, 64, 64)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_gather_b2, code: out_flattened = flattened_params.index_select(0, flattened_indices)
out_flattened: "bf16[396, 64, 64][4096, 64, 1]cpu" = flattened_params.index_select(0, flattened_indices); flattened_params = flattened_indices = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_gather_b2, code: out = out_flattened.reshape(params.shape[:2] + (num_indices_to_gather,) + params.shape[3:])
out: "bf16[1, 12, 33, 64, 64][1622016, 135168, 4096, 64, 1]cpu" = out_flattened.reshape((1, 12, 33, 64, 64)); out_flattened = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key = gathered_key.view(
gathered_key: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = out.view(1, 12, 11, 192, -1); out = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_gather_b2, code: shift = torch.arange(indices.shape[0] * indices.shape[1] * num_indices_to_gather, device=indices.device)
shift_1: "i64[396][1]cpu" = torch.arange(396, device = device(type='cpu'))
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_gather_b2, code: indices_shift = torch.div(shift, num_indices_to_gather, rounding_mode="floor") * num_indices_to_pick_from
div_1: "i64[396][1]cpu" = torch.div(shift_1, 33, rounding_mode = 'floor'); shift_1 = None
indices_shift_1: "i64[396][1]cpu" = div_1 * 13; div_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_gather_b2, code: flattened_indices = indices.view(-1) + indices_shift
view_6: "i64[396][1]cpu" = rand_attn_2.view(-1)
flattened_indices_1: "i64[396][1]cpu" = view_6 + indices_shift_1; view_6 = indices_shift_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_gather_b2, code: flattened_params = params.reshape(-1, params.shape[-2], params.shape[-1])
flattened_params_1: "bf16[156, 64, 64][4096, 64, 1]cpu" = blocked_value_matrix.reshape(-1, 64, 64)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_gather_b2, code: out_flattened = flattened_params.index_select(0, flattened_indices)
out_flattened_1: "bf16[396, 64, 64][4096, 64, 1]cpu" = flattened_params_1.index_select(0, flattened_indices_1); flattened_params_1 = flattened_indices_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_gather_b2, code: out = out_flattened.reshape(params.shape[:2] + (num_indices_to_gather,) + params.shape[3:])
out_1: "bf16[1, 12, 33, 64, 64][1622016, 135168, 4096, 64, 1]cpu" = out_flattened_1.reshape((1, 12, 33, 64, 64)); out_flattened_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value = gathered_value.view(
gathered_value: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = out_1.view(1, 12, 11, 192, -1); out_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, 0], key_layer, ndim=4)
getitem_4: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_query_matrix[(slice(None, None, None), slice(None, None, None), 0)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
reshape_4: "bf16[12, 64, 64][64, 768, 1]cpu" = getitem_4.reshape((-1, 64, 64)); getitem_4 = None
reshape_5: "bf16[12, 832, 64][64, 768, 1]cpu" = l_key_layer_.reshape((-1, 832, 64))
transpose: "bf16[12, 64, 832][64, 1, 768]cpu" = reshape_5.transpose(1, 2); reshape_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: return torch.bmm(
bmm: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.bmm(reshape_4, transpose); reshape_4 = transpose = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
first_product: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = bmm.view((1, 12, 64, 832)); bmm = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product = first_product * rsqrt_d
first_product_1: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = first_product * 0.125; first_product = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product += (1.0 - to_mask) * attn_mask_penalty
sub: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = 1.0 - l_to_mask_
mul_3: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = sub * -10000.0; sub = None
first_product_1 += mul_3; first_product_2: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = first_product_1; first_product_1 = mul_3 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_attn_weights = nn.functional.softmax(
first_attn_weights: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.nn.functional.softmax(first_product_2, dim = -1); first_product_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
reshape_6: "bf16[12, 64, 832][53248, 832, 1]cpu" = first_attn_weights.reshape((-1, 64, 832)); first_attn_weights = None
reshape_7: "bf16[12, 832, 64][64, 768, 1]cpu" = l_value_layer_.reshape((-1, 832, 64))
bmm_1: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.bmm(reshape_6, reshape_7); reshape_6 = reshape_7 = None
first_context_layer: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = bmm_1.view((1, 12, 64, 64)); bmm_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_context_layer.unsqueeze_(2)
unsqueeze__1: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = first_context_layer.unsqueeze_(2)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 0],
getitem_5: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), 0)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 1],
getitem_6: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), 1)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 2],
getitem_7: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), 2)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -1],
getitem_8: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), -1)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key[:, :, 0],
getitem_9: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = gathered_key[(slice(None, None, None), slice(None, None, None), 0)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_key_mat =
second_key_mat: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" =[getitem_5, getitem_6, getitem_7, getitem_8, getitem_9], dim = 2); getitem_5 = getitem_6 = getitem_7 = getitem_8 = getitem_9 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 0],
getitem_10: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), 0)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 1],
getitem_11: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), 1)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 2],
getitem_12: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), 2)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -1],
getitem_13: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), -1)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value[:, :, 0],
getitem_14: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = gathered_value[(slice(None, None, None), slice(None, None, None), 0)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_value_mat =
second_value_mat: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" =[getitem_10, getitem_11, getitem_12, getitem_13, getitem_14], dim = 2); getitem_10 = getitem_11 = getitem_12 = getitem_13 = getitem_14 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, 1], second_key_mat, ndim=4)
getitem_15: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_query_matrix[(slice(None, None, None), slice(None, None, None), 1)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
reshape_8: "bf16[12, 64, 64][64, 768, 1]cpu" = getitem_15.reshape((-1, 64, 64)); getitem_15 = None
reshape_9: "bf16[12, 448, 64][28672, 64, 1]cpu" = second_key_mat.reshape((-1, 448, 64)); second_key_mat = None
transpose_1: "bf16[12, 64, 448][28672, 1, 64]cpu" = reshape_9.transpose(1, 2); reshape_9 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: return torch.bmm(
bmm_2: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.bmm(reshape_8, transpose_1); reshape_8 = transpose_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
second_product: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = bmm_2.view((1, 12, 64, 448)); bmm_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, : 3 * to_block_size],
getitem_16: "f32[1, 1, 1, 192][832, 832, 832, 1]cpu" = l_to_mask_[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(None, 192, None))]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, -to_block_size:],
getitem_17: "f32[1, 1, 1, 64][832, 832, 832, 1]cpu" = l_to_mask_[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(-64, None, None))]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask.new_ones([bsz, 1, 1, n_rand_blocks * to_block_size]),
new_ones: "f32[1, 1, 1, 192][192, 192, 192, 1]cpu" = l_to_mask_.new_ones([1, 1, 1, 192])
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_seq_pad =
second_seq_pad: "f32[1, 1, 1, 448][448, 448, 448, 1]cpu" =[getitem_16, getitem_17, new_ones], dim = 3); getitem_16 = getitem_17 = new_ones = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask.new_ones([bsz, n_heads, from_block_size, 4 * to_block_size]),
new_ones_1: "f32[1, 12, 64, 256][196608, 16384, 256, 1]cpu" = rand_mask_2.new_ones([1, 12, 64, 256])
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask[:, :, 0],
getitem_18: "f32[1, 12, 64, 192][1622016, 135168, 192, 1]cpu" = rand_mask_2[(slice(None, None, None), slice(None, None, None), 0)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_rand_pad =
second_rand_pad: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" =[new_ones_1, getitem_18], dim = 3); new_ones_1 = getitem_18 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product = second_product * rsqrt_d
second_product_1: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = second_product * 0.125; second_product = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product += (1.0 - torch.minimum(second_seq_pad, second_rand_pad)) * attn_mask_penalty
minimum: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.minimum(second_seq_pad, second_rand_pad); second_seq_pad = second_rand_pad = None
sub_1: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = 1.0 - minimum; minimum = None
mul_5: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = sub_1 * -10000.0; sub_1 = None
second_product_1 += mul_5; second_product_2: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = second_product_1; second_product_1 = mul_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_attn_weights = nn.functional.softmax(
second_attn_weights: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.nn.functional.softmax(second_product_2, dim = -1); second_product_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
reshape_10: "bf16[12, 64, 448][28672, 448, 1]cpu" = second_attn_weights.reshape((-1, 64, 448)); second_attn_weights = None
reshape_11: "bf16[12, 448, 64][28672, 64, 1]cpu" = second_value_mat.reshape((-1, 448, 64)); second_value_mat = None
bmm_3: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.bmm(reshape_10, reshape_11); reshape_10 = reshape_11 = None
second_context_layer: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = bmm_3.view((1, 12, 64, 64)); bmm_3 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_context_layer.unsqueeze_(2)
unsqueeze__2: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = second_context_layer.unsqueeze_(2)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: [blocked_key_matrix[:, :, 1:-3], blocked_key_matrix[:, :, 2:-2], blocked_key_matrix[:, :, 3:-1]], dim=3
getitem_19: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), slice(1, -3, None))]
getitem_20: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), slice(2, -2, None))]
getitem_21: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), slice(3, -1, None))]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: exp_blocked_key_matrix =
exp_blocked_key_matrix: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" =[getitem_19, getitem_20, getitem_21], dim = 3); getitem_19 = getitem_20 = getitem_21 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: [blocked_value_matrix[:, :, 1:-3], blocked_value_matrix[:, :, 2:-2], blocked_value_matrix[:, :, 3:-1]],
getitem_22: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), slice(1, -3, None))]
getitem_23: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), slice(2, -2, None))]
getitem_24: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), slice(3, -1, None))]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: exp_blocked_value_matrix =
exp_blocked_value_matrix: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" =[getitem_22, getitem_23, getitem_24], dim = 3); getitem_22 = getitem_23 = getitem_24 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: middle_query_matrix = blocked_query_matrix[:, :, 2:-2]
middle_query_matrix: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = blocked_query_matrix[(slice(None, None, None), slice(None, None, None), slice(2, -2, None))]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
reshape_12: "bf16[108, 64, 64][4096, 64, 1]cpu" = middle_query_matrix.reshape((-1, 64, 64))
reshape_13: "bf16[108, 192, 64][12288, 64, 1]cpu" = exp_blocked_key_matrix.reshape((-1, 192, 64)); exp_blocked_key_matrix = None
transpose_2: "bf16[108, 64, 192][12288, 1, 64]cpu" = reshape_13.transpose(1, 2); reshape_13 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: return torch.bmm(
bmm_4: "bf16[108, 64, 192][12288, 192, 1]cpu" = torch.bmm(reshape_12, transpose_2); reshape_12 = transpose_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
inner_band_product: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = bmm_4.view((1, 12, 9, 64, 192)); bmm_4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: inner_band_product = inner_band_product * rsqrt_d
inner_band_product_1: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = inner_band_product * 0.125; inner_band_product = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product = self.torch_bmm_nd_transpose(middle_query_matrix, gathered_key[:, :, 1:-1], ndim=5)
getitem_26: "bf16[1, 12, 9, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = gathered_key[(slice(None, None, None), slice(None, None, None), slice(1, -1, None))]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
reshape_14: "bf16[108, 64, 64][4096, 64, 1]cpu" = middle_query_matrix.reshape((-1, 64, 64))
reshape_15: "bf16[108, 192, 64][12288, 64, 1]cpu" = getitem_26.reshape((-1, 192, 64)); getitem_26 = None
transpose_3: "bf16[108, 64, 192][12288, 1, 64]cpu" = reshape_15.transpose(1, 2); reshape_15 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: return torch.bmm(
bmm_5: "bf16[108, 64, 192][12288, 192, 1]cpu" = torch.bmm(reshape_14, transpose_3); reshape_14 = transpose_3 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
rand_band_product: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = bmm_5.view((1, 12, 9, 64, 192)); bmm_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product = rand_band_product * rsqrt_d
rand_band_product_1: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = rand_band_product * 0.125; rand_band_product = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqd,bhkd->bhlqk", middle_query_matrix, blocked_key_matrix[:, :, 0]
getitem_27: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), 0)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product = torch.einsum(
first_band_product: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.functional.einsum('bhlqd,bhkd->bhlqk', middle_query_matrix, getitem_27); getitem_27 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product = first_band_product * rsqrt_d
first_band_product_1: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = first_band_product * 0.125; first_band_product = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqd,bhkd->bhlqk", middle_query_matrix, blocked_key_matrix[:, :, -1]
getitem_28: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), -1)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product = torch.einsum(
last_band_product: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.functional.einsum('bhlqd,bhkd->bhlqk', middle_query_matrix, getitem_28); middle_query_matrix = getitem_28 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product = last_band_product * rsqrt_d
last_band_product_1: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = last_band_product * 0.125; last_band_product = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: inner_band_product += (1.0 - band_mask) * attn_mask_penalty
sub_2: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = 1.0 - l_band_mask_; l_band_mask_ = None
mul_10: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = sub_2 * -10000.0; sub_2 = None
inner_band_product_1 += mul_10; inner_band_product_2: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = inner_band_product_1; inner_band_product_1 = mul_10 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product += (1.0 - to_mask[:, :, :, :to_block_size].unsqueeze(3)) * attn_mask_penalty
getitem_29: "f32[1, 1, 1, 64][832, 832, 832, 1]cpu" = l_to_mask_[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(None, 64, None))]
unsqueeze: "f32[1, 1, 1, 1, 64][832, 832, 832, 64, 1]cpu" = getitem_29.unsqueeze(3); getitem_29 = None
sub_3: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = 1.0 - unsqueeze; unsqueeze = None
mul_11: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = sub_3 * -10000.0; sub_3 = None
first_band_product_1 += mul_11; first_band_product_2: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = first_band_product_1; first_band_product_1 = mul_11 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product += (1.0 - to_mask[:, :, :, -to_block_size:].unsqueeze(3)) * attn_mask_penalty
getitem_30: "f32[1, 1, 1, 64][832, 832, 832, 1]cpu" = l_to_mask_[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(-64, None, None))]
unsqueeze_1: "f32[1, 1, 1, 1, 64][832, 832, 832, 64, 1]cpu" = getitem_30.unsqueeze(3); getitem_30 = None
sub_4: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = 1.0 - unsqueeze_1; unsqueeze_1 = None
mul_12: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = sub_4 * -10000.0; sub_4 = None
last_band_product_1 += mul_12; last_band_product_2: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = last_band_product_1; last_band_product_1 = mul_12 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product += (1.0 - rand_mask[:, :, 1:-1]) * attn_mask_penalty
getitem_31: "f32[1, 12, 9, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = rand_mask_2[(slice(None, None, None), slice(None, None, None), slice(1, -1, None))]
sub_5: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = 1.0 - getitem_31; getitem_31 = None
mul_13: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = sub_5 * -10000.0; sub_5 = None
rand_band_product_1 += mul_13; rand_band_product_2: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = rand_band_product_1; rand_band_product_1 = mul_13 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: band_product =
band_product: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" =[first_band_product_2, inner_band_product_2, rand_band_product_2, last_band_product_2], dim = -1); first_band_product_2 = inner_band_product_2 = rand_band_product_2 = last_band_product_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights = nn.functional.softmax(
attn_weights: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.nn.functional.softmax(band_product, dim = -1); band_product = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights[:, :, :, :, to_block_size : 4 * to_block_size], exp_blocked_value_matrix, ndim=5
getitem_32: "bf16[1, 12, 9, 64, 192][3538944, 294912, 32768, 512, 1]cpu" = attn_weights[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(64, 256, None))]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
reshape_16: "bf16[108, 64, 192][32768, 512, 1]cpu" = getitem_32.reshape((-1, 64, 192)); getitem_32 = None
reshape_17: "bf16[108, 192, 64][12288, 64, 1]cpu" = exp_blocked_value_matrix.reshape((-1, 192, 64)); exp_blocked_value_matrix = None
bmm_6: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.bmm(reshape_16, reshape_17); reshape_16 = reshape_17 = None
context_layer: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = bmm_6.view((1, 12, 9, 64, 64)); bmm_6 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights[:, :, :, :, 4 * to_block_size : -to_block_size], gathered_value[:, :, 1:-1], ndim=5
getitem_33: "bf16[1, 12, 9, 64, 192][3538944, 294912, 32768, 512, 1]cpu" = attn_weights[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(256, -64, None))]
getitem_34: "bf16[1, 12, 9, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = gathered_value[(slice(None, None, None), slice(None, None, None), slice(1, -1, None))]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
reshape_18: "bf16[108, 64, 192][32768, 512, 1]cpu" = getitem_33.reshape((-1, 64, 192)); getitem_33 = None
reshape_19: "bf16[108, 192, 64][12288, 64, 1]cpu" = getitem_34.reshape((-1, 192, 64)); getitem_34 = None
bmm_7: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.bmm(reshape_18, reshape_19); reshape_18 = reshape_19 = None
view_15: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = bmm_7.view((1, 12, 9, 64, 64)); bmm_7 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += self.torch_bmm_nd(
context_layer += view_15; context_layer_1: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = context_layer; context_layer = view_15 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, :to_block_size], blocked_value_matrix[:, :, 0]
getitem_35: "bf16[1, 12, 9, 64, 64][3538944, 294912, 32768, 512, 1]cpu" = attn_weights[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(None, 64, None))]
getitem_36: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), 0)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += torch.einsum(
einsum_3: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.functional.einsum('bhlqk,bhkd->bhlqd', getitem_35, getitem_36); getitem_35 = getitem_36 = None
context_layer_1 += einsum_3; context_layer_2: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = context_layer_1; context_layer_1 = einsum_3 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, -to_block_size:], blocked_value_matrix[:, :, -1]
getitem_37: "bf16[1, 12, 9, 64, 64][3538944, 294912, 32768, 512, 1]cpu" = attn_weights[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(-64, None, None))]; attn_weights = None
getitem_38: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), -1)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += torch.einsum(
einsum_4: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.functional.einsum('bhlqk,bhkd->bhlqd', getitem_37, getitem_38); getitem_37 = getitem_38 = None
context_layer_2 += einsum_4; context_layer_3: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = context_layer_2; context_layer_2 = einsum_4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 0],
getitem_39: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), 0)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -3],
getitem_40: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), -3)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -2],
getitem_41: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), -2)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -1],
getitem_42: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), -1)]; blocked_key_matrix = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key[:, :, -1],
getitem_43: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = gathered_key[(slice(None, None, None), slice(None, None, None), -1)]; gathered_key = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_key_mat =
second_last_key_mat: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" =[getitem_39, getitem_40, getitem_41, getitem_42, getitem_43], dim = 2); getitem_39 = getitem_40 = getitem_41 = getitem_42 = getitem_43 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 0],
getitem_44: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), 0)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -3],
getitem_45: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), -3)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -2],
getitem_46: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), -2)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -1],
getitem_47: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), -1)]; blocked_value_matrix = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value[:, :, -1],
getitem_48: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = gathered_value[(slice(None, None, None), slice(None, None, None), -1)]; gathered_value = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_value_mat =
second_last_value_mat: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" =[getitem_44, getitem_45, getitem_46, getitem_47, getitem_48], dim = 2); getitem_44 = getitem_45 = getitem_46 = getitem_47 = getitem_48 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -2], second_last_key_mat, ndim=4)
getitem_49: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_query_matrix[(slice(None, None, None), slice(None, None, None), -2)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
reshape_20: "bf16[12, 64, 64][64, 768, 1]cpu" = getitem_49.reshape((-1, 64, 64)); getitem_49 = None
reshape_21: "bf16[12, 448, 64][28672, 64, 1]cpu" = second_last_key_mat.reshape((-1, 448, 64)); second_last_key_mat = None
transpose_4: "bf16[12, 64, 448][28672, 1, 64]cpu" = reshape_21.transpose(1, 2); reshape_21 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: return torch.bmm(
bmm_8: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.bmm(reshape_20, transpose_4); reshape_20 = transpose_4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
second_last_product: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = bmm_8.view((1, 12, 64, 448)); bmm_8 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, :to_block_size],
getitem_50: "f32[1, 1, 1, 64][832, 832, 832, 1]cpu" = l_to_mask_[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(None, 64, None))]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, -3 * to_block_size :],
getitem_51: "f32[1, 1, 1, 192][832, 832, 832, 1]cpu" = l_to_mask_[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(-192, None, None))]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask.new_ones([bsz, 1, 1, n_rand_blocks * to_block_size]),
new_ones_2: "f32[1, 1, 1, 192][192, 192, 192, 1]cpu" = l_to_mask_.new_ones([1, 1, 1, 192])
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_seq_pad =
second_last_seq_pad: "f32[1, 1, 1, 448][448, 448, 448, 1]cpu" =[getitem_50, getitem_51, new_ones_2], dim = 3); getitem_50 = getitem_51 = new_ones_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask.new_ones([bsz, n_heads, from_block_size, 4 * to_block_size]),
new_ones_3: "f32[1, 12, 64, 256][196608, 16384, 256, 1]cpu" = rand_mask_2.new_ones([1, 12, 64, 256])
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask[:, :, -1],
getitem_52: "f32[1, 12, 64, 192][1622016, 135168, 192, 1]cpu" = rand_mask_2[(slice(None, None, None), slice(None, None, None), -1)]; rand_mask_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_rand_pad =
second_last_rand_pad: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" =[new_ones_3, getitem_52], dim = 3); new_ones_3 = getitem_52 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product = second_last_product * rsqrt_d
second_last_product_1: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = second_last_product * 0.125; second_last_product = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product += (1.0 - torch.minimum(second_last_seq_pad, second_last_rand_pad)) * attn_mask_penalty
minimum_1: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.minimum(second_last_seq_pad, second_last_rand_pad); second_last_seq_pad = second_last_rand_pad = None
sub_6: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = 1.0 - minimum_1; minimum_1 = None
mul_15: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = sub_6 * -10000.0; sub_6 = None
second_last_product_1 += mul_15; second_last_product_2: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = second_last_product_1; second_last_product_1 = mul_15 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_attn_weights = nn.functional.softmax(
second_last_attn_weights: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.nn.functional.softmax(second_last_product_2, dim = -1); second_last_product_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
reshape_22: "bf16[12, 64, 448][28672, 448, 1]cpu" = second_last_attn_weights.reshape((-1, 64, 448)); second_last_attn_weights = None
reshape_23: "bf16[12, 448, 64][28672, 64, 1]cpu" = second_last_value_mat.reshape((-1, 448, 64)); second_last_value_mat = None
bmm_9: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.bmm(reshape_22, reshape_23); reshape_22 = reshape_23 = None
second_last_context_layer: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = bmm_9.view((1, 12, 64, 64)); bmm_9 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_context_layer.unsqueeze_(2)
unsqueeze__3: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = second_last_context_layer.unsqueeze_(2)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -1], key_layer, ndim=4)
getitem_53: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_query_matrix[(slice(None, None, None), slice(None, None, None), -1)]; blocked_query_matrix = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
reshape_24: "bf16[12, 64, 64][64, 768, 1]cpu" = getitem_53.reshape((-1, 64, 64)); getitem_53 = None
reshape_25: "bf16[12, 832, 64][64, 768, 1]cpu" = l_key_layer_.reshape((-1, 832, 64)); l_key_layer_ = None
transpose_5: "bf16[12, 64, 832][64, 1, 768]cpu" = reshape_25.transpose(1, 2); reshape_25 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: return torch.bmm(
bmm_10: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.bmm(reshape_24, transpose_5); reshape_24 = transpose_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
last_product: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = bmm_10.view((1, 12, 64, 832)); bmm_10 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product = last_product * rsqrt_d
last_product_1: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = last_product * 0.125; last_product = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product += (1.0 - to_mask) * attn_mask_penalty
sub_7: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = 1.0 - l_to_mask_; l_to_mask_ = None
mul_17: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = sub_7 * -10000.0; sub_7 = None
last_product_1 += mul_17; last_product_2: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = last_product_1; last_product_1 = mul_17 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_attn_weights = nn.functional.softmax(last_product, dim=-1) # [bsz, n_heads, from_block_size, n]
last_attn_weights: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.nn.functional.softmax(last_product_2, dim = -1); last_product_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
reshape_26: "bf16[12, 64, 832][53248, 832, 1]cpu" = last_attn_weights.reshape((-1, 64, 832)); last_attn_weights = None
reshape_27: "bf16[12, 832, 64][64, 768, 1]cpu" = l_value_layer_.reshape((-1, 832, 64)); l_value_layer_ = None
bmm_11: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.bmm(reshape_26, reshape_27); reshape_26 = reshape_27 = None
last_context_layer: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = bmm_11.view((1, 12, 64, 64)); bmm_11 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_context_layer.unsqueeze_(2)
unsqueeze__4: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = last_context_layer.unsqueeze_(2)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer =
context_layer_4: "bf16[1, 12, 13, 64, 64][638976, 53248, 4096, 64, 1]cpu" =[first_context_layer, second_context_layer, context_layer_3, second_last_context_layer, last_context_layer], dim = 2); first_context_layer = second_context_layer = context_layer_3 = second_last_context_layer = last_context_layer = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer = context_layer.view((bsz, n_heads, from_seq_len, -1)) * from_mask
view_20: "bf16[1, 12, 832, 64][638976, 53248, 64, 1]cpu" = context_layer_4.view((1, 12, 832, -1)); context_layer_4 = None
context_layer_5: "f32[1, 12, 832, 64][638976, 53248, 64, 1]cpu" = view_20 * l_from_mask_; view_20 = l_from_mask_ = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer = torch.transpose(context_layer, 1, 2)
context_layer_6: "f32[1, 832, 12, 64][638976, 64, 53248, 1]cpu" = torch.transpose(context_layer_5, 1, 2); context_layer_5 = None
return (context_layer_6, rand_attn_2)
V0627 17:31:05.175000 139845268738432 torch/_functorch/_aot_autograd/] {"aot_forward_graph": {}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0, "has_payload": "9d92a7e58f208e3c617d3e5fb4f3ee25"}
class <lambda>(torch.nn.Module):
def forward(self, arg0_1: "i32[11, 3][3, 1]cpu", arg1_1: "i32[11, 3][3, 1]cpu", arg2_1: "i32[11, 3][3, 1]cpu", arg3_1: "i32[11, 3][3, 1]cpu", arg4_1: "i32[11, 3][3, 1]cpu", arg5_1: "i32[11, 3][3, 1]cpu", arg6_1: "i32[11, 3][3, 1]cpu", arg7_1: "i32[11, 3][3, 1]cpu", arg8_1: "i32[11, 3][3, 1]cpu", arg9_1: "i32[11, 3][3, 1]cpu", arg10_1: "i32[11, 3][3, 1]cpu", arg11_1: "i32[11, 3][3, 1]cpu", arg12_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", arg13_1: "f32[1, 13, 64][832, 64, 1]cpu", arg14_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", arg15_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", arg16_1: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu", arg17_1: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu", arg18_1: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu"):
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = np.stack(rand_attn, axis=0)
cat: "i32[132, 3][3, 1]cpu" =[arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1]); arg0_1 = arg1_1 = arg2_1 = arg3_1 = arg4_1 = arg5_1 = arg6_1 = arg7_1 = arg8_1 = arg9_1 = arg10_1 = arg11_1 = None
view: "i32[12, 11, 3][33, 3, 1]cpu" = torch.ops.aten.view.default(cat, [12, 11, 3]); cat = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = torch.tensor(rand_attn, device=query_layer.device, dtype=torch.long)
alias: "i32[12, 11, 3][33, 3, 1]cpu" = torch.ops.aten.alias.default(view); view = None
alias_1: "i32[12, 11, 3][33, 3, 1]cpu" = torch.ops.aten.alias.default(alias); alias = None
alias_2: "i32[12, 11, 3][33, 3, 1]cpu" = torch.ops.aten.alias.default(alias_1); alias_1 = None
convert_element_type: "i64[12, 11, 3][33, 3, 1]cpu" = torch.ops.prims.convert_element_type.default(alias_2, torch.int64); alias_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn.unsqueeze_(0)
unsqueeze: "i64[1, 12, 11, 3][396, 33, 3, 1]cpu" = torch.ops.aten.unsqueeze.default(convert_element_type, 0); convert_element_type = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn =[rand_attn for _ in range(batch_size)], dim=0)
clone: "i64[1, 12, 11, 3][396, 33, 3, 1]cpu" = torch.ops.aten.clone.default(unsqueeze); unsqueeze = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in _create_rand_mask_from_inputs, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
select: "f32[13, 64][64, 1]cpu" =, 0, 0)
select_1: "i64[12, 11, 3][33, 3, 1]cpu" =, 0, 0)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in <listcomp>, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
view_1: "i64[396][1]cpu" = torch.ops.aten.view.default(select_1, [396]); select_1 = None
index: "f32[396, 64][64, 1]cpu" = torch.ops.aten.index.Tensor(select, [view_1]); select = view_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in _create_rand_mask_from_inputs, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
clone_1: "f32[396, 64][64, 1]cpu" = torch.ops.aten.clone.default(index); index = None
view_2: "f32[1, 396, 64][25344, 64, 1]cpu" = torch.ops.aten.view.default(clone_1, [1, 396, 64]); clone_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in _create_rand_mask_from_inputs, code: rand_mask = rand_mask.view(batch_size, num_attention_heads, num_windows, num_rand_blocks * from_block_size)
view_3: "f32[1, 12, 11, 192][25344, 2112, 192, 1]cpu" = torch.ops.aten.view.default(view_2, [1, 12, 11, 192]); view_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in _create_rand_mask_from_inputs, code: rand_mask = torch.einsum("blq,bhlk->bhlqk", from_blocked_mask[:, 1:-1], rand_mask)
slice_1: "f32[1, 13, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(arg13_1, 0, 0, 9223372036854775807); arg13_1 = None
slice_2: "f32[1, 11, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_1, 1, 1, -1); slice_1 = None
unsqueeze_1: "f32[1, 11, 64, 1][832, 64, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_2, 3); slice_2 = None
unsqueeze_2: "f32[1, 11, 64, 1, 1][832, 64, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_1, 4); unsqueeze_1 = None
permute: "f32[1, 1, 11, 64, 1][832, 1, 64, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_2, [0, 3, 1, 2, 4]); unsqueeze_2 = None
unsqueeze_3: "f32[1, 12, 11, 192, 1][25344, 2112, 192, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(view_3, 4); view_3 = None
permute_1: "f32[1, 12, 11, 1, 192][25344, 2112, 192, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_3, [0, 1, 2, 4, 3]); unsqueeze_3 = None
mul: "f32[1, 12, 11, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(permute, permute_1); permute = permute_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_query_matrix = query_layer.view(bsz, n_heads, from_seq_len // from_block_size, from_block_size, -1)
view_4: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.view.default(arg12_1, [1, 12, 13, 64, -1]); arg12_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix = key_layer.view(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1)
view_5: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.view.default(arg14_1, [1, 12, 13, 64, -1])
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix = value_layer.view(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1)
view_6: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.view.default(arg15_1, [1, 12, 13, 64, -1])
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_gather_b2, code: shift = torch.arange(indices.shape[0] * indices.shape[1] * num_indices_to_gather, device=indices.device)
iota: "i64[396][1]cpu" = torch.ops.prims.iota.default(396, start = 0, step = 1, dtype = torch.int64, device = device(type='cpu'), requires_grad = False)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_gather_b2, code: indices_shift = torch.div(shift, num_indices_to_gather, rounding_mode="floor") * num_indices_to_pick_from
div: "i64[396][1]cpu" = torch.ops.aten.div.Tensor_mode(iota, 33, rounding_mode = 'floor'); iota = None
mul_1: "i64[396][1]cpu" = torch.ops.aten.mul.Tensor(div, 13); div = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_gather_b2, code: flattened_indices = indices.view(-1) + indices_shift
view_7: "i64[396][1]cpu" = torch.ops.aten.view.default(clone, [-1])
add: "i64[396][1]cpu" = torch.ops.aten.add.Tensor(view_7, mul_1); view_7 = mul_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_gather_b2, code: flattened_params = params.reshape(-1, params.shape[-2], params.shape[-1])
clone_2: "bf16[1, 12, 13, 64, 64][638976, 53248, 4096, 64, 1]cpu" = torch.ops.aten.clone.default(view_5, memory_format = torch.contiguous_format)
view_8: "bf16[156, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.view.default(clone_2, [156, 64, 64]); clone_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_gather_b2, code: out_flattened = flattened_params.index_select(0, flattened_indices)
index_1: "bf16[396, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.index.Tensor(view_8, [add]); view_8 = add = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_gather_b2, code: out = out_flattened.reshape(params.shape[:2] + (num_indices_to_gather,) + params.shape[3:])
view_9: "bf16[1, 12, 33, 64, 64][1622016, 135168, 4096, 64, 1]cpu" = torch.ops.aten.view.default(index_1, [1, 12, 33, 64, 64]); index_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key = gathered_key.view(
view_10: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.view.default(view_9, [1, 12, 11, 192, -1]); view_9 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_gather_b2, code: shift = torch.arange(indices.shape[0] * indices.shape[1] * num_indices_to_gather, device=indices.device)
iota_1: "i64[396][1]cpu" = torch.ops.prims.iota.default(396, start = 0, step = 1, dtype = torch.int64, device = device(type='cpu'), requires_grad = False)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_gather_b2, code: indices_shift = torch.div(shift, num_indices_to_gather, rounding_mode="floor") * num_indices_to_pick_from
div_1: "i64[396][1]cpu" = torch.ops.aten.div.Tensor_mode(iota_1, 33, rounding_mode = 'floor'); iota_1 = None
mul_2: "i64[396][1]cpu" = torch.ops.aten.mul.Tensor(div_1, 13); div_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_gather_b2, code: flattened_indices = indices.view(-1) + indices_shift
view_11: "i64[396][1]cpu" = torch.ops.aten.view.default(clone, [-1])
add_1: "i64[396][1]cpu" = torch.ops.aten.add.Tensor(view_11, mul_2); view_11 = mul_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_gather_b2, code: flattened_params = params.reshape(-1, params.shape[-2], params.shape[-1])
clone_3: "bf16[1, 12, 13, 64, 64][638976, 53248, 4096, 64, 1]cpu" = torch.ops.aten.clone.default(view_6, memory_format = torch.contiguous_format)
view_12: "bf16[156, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.view.default(clone_3, [156, 64, 64]); clone_3 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_gather_b2, code: out_flattened = flattened_params.index_select(0, flattened_indices)
index_2: "bf16[396, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.index.Tensor(view_12, [add_1]); view_12 = add_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_gather_b2, code: out = out_flattened.reshape(params.shape[:2] + (num_indices_to_gather,) + params.shape[3:])
view_13: "bf16[1, 12, 33, 64, 64][1622016, 135168, 4096, 64, 1]cpu" = torch.ops.aten.view.default(index_2, [1, 12, 33, 64, 64]); index_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value = gathered_value.view(
view_14: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.view.default(view_13, [1, 12, 11, 192, -1]); view_13 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, 0], key_layer, ndim=4)
slice_3: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_4, 0, 0, 9223372036854775807)
slice_4: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_3, 1, 0, 9223372036854775807); slice_3 = None
select_2: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" =, 2, 0); slice_4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
view_15: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(select_2, [12, 64, 64]); select_2 = None
view_16: "bf16[12, 832, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(arg14_1, [12, 832, 64])
permute_2: "bf16[12, 64, 832][64, 1, 768]cpu" = torch.ops.aten.permute.default(view_16, [0, 2, 1]); view_16 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: return torch.bmm(
bmm: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.ops.aten.bmm.default(view_15, permute_2); view_15 = permute_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
view_17: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.view.default(bmm, [1, 12, 64, 832]); bmm = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product = first_product * rsqrt_d
mul_3: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.mul.Tensor(view_17, 0.125); view_17 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product += (1.0 - to_mask) * attn_mask_penalty
sub: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, arg16_1)
mul_4: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.mul.Tensor(sub, -10000.0); sub = None
add_2: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.add.Tensor(mul_3, mul_4); mul_3 = mul_4 = None
convert_element_type_3: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.prims.convert_element_type.default(add_2, torch.bfloat16); add_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_attn_weights = nn.functional.softmax(
convert_element_type_4: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.prims.convert_element_type.default(convert_element_type_3, torch.float32); convert_element_type_3 = None
amax: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.amax.default(convert_element_type_4, [-1], True)
sub_1: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.sub.Tensor(convert_element_type_4, amax); convert_element_type_4 = amax = None
exp: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.exp.default(sub_1); sub_1 = None
sum_1: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp, [-1], True)
div_2: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.div.Tensor(exp, sum_1); exp = sum_1 = None
convert_element_type_5: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.prims.convert_element_type.default(div_2, torch.bfloat16); div_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
view_18: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.ops.aten.view.default(convert_element_type_5, [-1, 64, 832]); convert_element_type_5 = None
view_19: "bf16[12, 832, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(arg15_1, [12, 832, 64])
bmm_1: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_18, view_19); view_18 = view_19 = None
view_20: "bf16[1, 12, 64, 64][49152, 4096, 64, 1]cpu" = torch.ops.aten.view.default(bmm_1, [1, 12, 64, 64]); bmm_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_context_layer.unsqueeze_(2)
unsqueeze_4: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(view_20, 2); view_20 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 0],
slice_5: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807)
slice_6: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_5, 1, 0, 9223372036854775807); slice_5 = None
select_3: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" =, 2, 0); slice_6 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 1],
slice_7: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807)
slice_8: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_7, 1, 0, 9223372036854775807); slice_7 = None
select_4: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" =, 2, 1); slice_8 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 2],
slice_9: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807)
slice_10: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_9, 1, 0, 9223372036854775807); slice_9 = None
select_5: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" =, 2, 2); slice_10 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -1],
slice_11: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807)
slice_12: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_11, 1, 0, 9223372036854775807); slice_11 = None
select_6: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" =, 2, -1); slice_12 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key[:, :, 0],
slice_13: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view_10, 0, 0, 9223372036854775807)
slice_14: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_13, 1, 0, 9223372036854775807); slice_13 = None
select_7: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" =, 2, 0); slice_14 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_key_mat =
cat_1: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" =[select_3, select_4, select_5, select_6, select_7], 2); select_3 = select_4 = select_5 = select_6 = select_7 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 0],
slice_15: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807)
slice_16: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_15, 1, 0, 9223372036854775807); slice_15 = None
select_8: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" =, 2, 0); slice_16 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 1],
slice_17: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807)
slice_18: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_17, 1, 0, 9223372036854775807); slice_17 = None
select_9: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" =, 2, 1); slice_18 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 2],
slice_19: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807)
slice_20: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_19, 1, 0, 9223372036854775807); slice_19 = None
select_10: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" =, 2, 2); slice_20 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -1],
slice_21: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807)
slice_22: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_21, 1, 0, 9223372036854775807); slice_21 = None
select_11: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" =, 2, -1); slice_22 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value[:, :, 0],
slice_23: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view_14, 0, 0, 9223372036854775807)
slice_24: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_23, 1, 0, 9223372036854775807); slice_23 = None
select_12: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" =, 2, 0); slice_24 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_value_mat =
cat_2: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" =[select_8, select_9, select_10, select_11, select_12], 2); select_8 = select_9 = select_10 = select_11 = select_12 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, 1], second_key_mat, ndim=4)
slice_25: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_4, 0, 0, 9223372036854775807)
slice_26: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_25, 1, 0, 9223372036854775807); slice_25 = None
select_13: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" =, 2, 1); slice_26 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
view_21: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(select_13, [12, 64, 64]); select_13 = None
view_22: "bf16[12, 448, 64][28672, 64, 1]cpu" = torch.ops.aten.view.default(cat_1, [-1, 448, 64]); cat_1 = None
permute_3: "bf16[12, 64, 448][28672, 1, 64]cpu" = torch.ops.aten.permute.default(view_22, [0, 2, 1]); view_22 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: return torch.bmm(
bmm_2: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.ops.aten.bmm.default(view_21, permute_3); view_21 = permute_3 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
view_23: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.view.default(bmm_2, [1, 12, 64, 448]); bmm_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, : 3 * to_block_size],
slice_27: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 0, 0, 9223372036854775807)
slice_28: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_27, 1, 0, 9223372036854775807); slice_27 = None
slice_29: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_28, 2, 0, 9223372036854775807); slice_28 = None
slice_30: "f32[1, 1, 1, 192][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_29, 3, 0, 192); slice_29 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, -to_block_size:],
slice_31: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 0, 0, 9223372036854775807)
slice_32: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_31, 1, 0, 9223372036854775807); slice_31 = None
slice_33: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_32, 2, 0, 9223372036854775807); slice_32 = None
slice_34: "f32[1, 1, 1, 64][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_33, 3, -64, 9223372036854775807); slice_33 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask.new_ones([bsz, 1, 1, n_rand_blocks * to_block_size]),
full: "f32[1, 1, 1, 192][192, 192, 192, 1]cpu" = torch.ops.aten.full.default([1, 1, 1, 192], 1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_seq_pad =
cat_3: "f32[1, 1, 1, 448][448, 448, 448, 1]cpu" =[slice_30, slice_34, full], 3); slice_30 = slice_34 = full = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask.new_ones([bsz, n_heads, from_block_size, 4 * to_block_size]),
full_1: "f32[1, 12, 64, 256][196608, 16384, 256, 1]cpu" = torch.ops.aten.full.default([1, 12, 64, 256], 1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask[:, :, 0],
slice_35: "f32[1, 12, 11, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.ops.aten.slice.Tensor(mul, 0, 0, 9223372036854775807)
slice_36: "f32[1, 12, 11, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.ops.aten.slice.Tensor(slice_35, 1, 0, 9223372036854775807); slice_35 = None
select_14: "f32[1, 12, 64, 192][1622016, 135168, 192, 1]cpu" =, 2, 0); slice_36 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_rand_pad =
cat_4: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" =[full_1, select_14], 3); full_1 = select_14 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product = second_product * rsqrt_d
mul_5: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.mul.Tensor(view_23, 0.125); view_23 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product += (1.0 - torch.minimum(second_seq_pad, second_rand_pad)) * attn_mask_penalty
minimum: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.minimum.default(cat_3, cat_4); cat_3 = cat_4 = None
sub_2: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, minimum); minimum = None
mul_6: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.mul.Tensor(sub_2, -10000.0); sub_2 = None
add_3: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.add.Tensor(mul_5, mul_6); mul_5 = mul_6 = None
convert_element_type_10: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.prims.convert_element_type.default(add_3, torch.bfloat16); add_3 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_attn_weights = nn.functional.softmax(
convert_element_type_11: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.prims.convert_element_type.default(convert_element_type_10, torch.float32); convert_element_type_10 = None
amax_1: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.amax.default(convert_element_type_11, [-1], True)
sub_3: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.sub.Tensor(convert_element_type_11, amax_1); convert_element_type_11 = amax_1 = None
exp_1: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.exp.default(sub_3); sub_3 = None
sum_2: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp_1, [-1], True)
div_3: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.div.Tensor(exp_1, sum_2); exp_1 = sum_2 = None
convert_element_type_12: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.prims.convert_element_type.default(div_3, torch.bfloat16); div_3 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
view_24: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.ops.aten.view.default(convert_element_type_12, [-1, 64, 448]); convert_element_type_12 = None
view_25: "bf16[12, 448, 64][28672, 64, 1]cpu" = torch.ops.aten.view.default(cat_2, [-1, 448, 64]); cat_2 = None
bmm_3: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_24, view_25); view_24 = view_25 = None
view_26: "bf16[1, 12, 64, 64][49152, 4096, 64, 1]cpu" = torch.ops.aten.view.default(bmm_3, [1, 12, 64, 64]); bmm_3 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_context_layer.unsqueeze_(2)
unsqueeze_5: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(view_26, 2); view_26 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: [blocked_key_matrix[:, :, 1:-3], blocked_key_matrix[:, :, 2:-2], blocked_key_matrix[:, :, 3:-1]], dim=3
slice_37: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807)
slice_38: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_37, 1, 0, 9223372036854775807); slice_37 = None
slice_39: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_38, 2, 1, -3); slice_38 = None
slice_40: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807)
slice_41: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_40, 1, 0, 9223372036854775807); slice_40 = None
slice_42: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_41, 2, 2, -2); slice_41 = None
slice_43: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807)
slice_44: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_43, 1, 0, 9223372036854775807); slice_43 = None
slice_45: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_44, 2, 3, -1); slice_44 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: exp_blocked_key_matrix =
cat_5: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" =[slice_39, slice_42, slice_45], 3); slice_39 = slice_42 = slice_45 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: [blocked_value_matrix[:, :, 1:-3], blocked_value_matrix[:, :, 2:-2], blocked_value_matrix[:, :, 3:-1]],
slice_46: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807)
slice_47: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_46, 1, 0, 9223372036854775807); slice_46 = None
slice_48: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_47, 2, 1, -3); slice_47 = None
slice_49: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807)
slice_50: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_49, 1, 0, 9223372036854775807); slice_49 = None
slice_51: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_50, 2, 2, -2); slice_50 = None
slice_52: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807)
slice_53: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_52, 1, 0, 9223372036854775807); slice_52 = None
slice_54: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_53, 2, 3, -1); slice_53 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: exp_blocked_value_matrix =
cat_6: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" =[slice_48, slice_51, slice_54], 3); slice_48 = slice_51 = slice_54 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: middle_query_matrix = blocked_query_matrix[:, :, 2:-2]
slice_55: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_4, 0, 0, 9223372036854775807)
slice_56: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_55, 1, 0, 9223372036854775807); slice_55 = None
slice_57: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_56, 2, 2, -2); slice_56 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
clone_4: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.clone.default(slice_57, memory_format = torch.contiguous_format)
view_27: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.view.default(clone_4, [108, 64, 64]); clone_4 = None
view_28: "bf16[108, 192, 64][12288, 64, 1]cpu" = torch.ops.aten.view.default(cat_5, [-1, 192, 64]); cat_5 = None
permute_4: "bf16[108, 64, 192][12288, 1, 64]cpu" = torch.ops.aten.permute.default(view_28, [0, 2, 1]); view_28 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: return torch.bmm(
bmm_4: "bf16[108, 64, 192][12288, 192, 1]cpu" = torch.ops.aten.bmm.default(view_27, permute_4); view_27 = permute_4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
view_29: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.view.default(bmm_4, [1, 12, 9, 64, 192]); bmm_4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: inner_band_product = inner_band_product * rsqrt_d
mul_7: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(view_29, 0.125); view_29 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product = self.torch_bmm_nd_transpose(middle_query_matrix, gathered_key[:, :, 1:-1], ndim=5)
slice_58: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view_10, 0, 0, 9223372036854775807)
slice_59: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_58, 1, 0, 9223372036854775807); slice_58 = None
slice_60: "bf16[1, 12, 9, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_59, 2, 1, -1); slice_59 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
clone_5: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.clone.default(slice_57, memory_format = torch.contiguous_format)
view_30: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.view.default(clone_5, [108, 64, 64]); clone_5 = None
clone_6: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" = torch.ops.aten.clone.default(slice_60, memory_format = torch.contiguous_format); slice_60 = None
view_31: "bf16[108, 192, 64][12288, 64, 1]cpu" = torch.ops.aten.view.default(clone_6, [108, 192, 64]); clone_6 = None
permute_5: "bf16[108, 64, 192][12288, 1, 64]cpu" = torch.ops.aten.permute.default(view_31, [0, 2, 1]); view_31 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: return torch.bmm(
bmm_5: "bf16[108, 64, 192][12288, 192, 1]cpu" = torch.ops.aten.bmm.default(view_30, permute_5); view_30 = permute_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
view_32: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.view.default(bmm_5, [1, 12, 9, 64, 192]); bmm_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product = rand_band_product * rsqrt_d
mul_8: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(view_32, 0.125); view_32 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqd,bhkd->bhlqk", middle_query_matrix, blocked_key_matrix[:, :, 0]
slice_61: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807)
slice_62: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_61, 1, 0, 9223372036854775807); slice_61 = None
select_15: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" =, 2, 0); slice_62 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product = torch.einsum(
unsqueeze_6: "bf16[1, 12, 9, 64, 64, 1][638976, 64, 49152, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_57, 5)
permute_6: "bf16[1, 12, 9, 64, 1, 64][638976, 64, 49152, 768, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_6, [0, 1, 2, 3, 5, 4]); unsqueeze_6 = None
unsqueeze_7: "bf16[1, 12, 64, 64, 1][638976, 64, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(select_15, 4); select_15 = None
unsqueeze_8: "bf16[1, 12, 64, 64, 1, 1][638976, 64, 768, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_7, 5); unsqueeze_7 = None
permute_7: "bf16[1, 12, 1, 1, 64, 64][638976, 64, 1, 1, 768, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_8, [0, 1, 4, 5, 2, 3]); unsqueeze_8 = None
permute_8: "bf16[12, 9, 64, 64, 1, 1][64, 49152, 768, 1, 638976, 1]cpu" = torch.ops.aten.permute.default(permute_6, [1, 2, 3, 5, 0, 4]); permute_6 = None
view_33: "bf16[12, 576, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(permute_8, [12, 576, 64]); permute_8 = None
permute_9: "bf16[12, 64, 1, 64, 1, 1][64, 1, 638976, 768, 1, 1]cpu" = torch.ops.aten.permute.default(permute_7, [1, 5, 0, 4, 2, 3]); permute_7 = None
view_34: "bf16[12, 64, 64][64, 1, 768]cpu" = torch.ops.aten.view.default(permute_9, [12, 64, 64]); permute_9 = None
bmm_6: "bf16[12, 576, 64][36864, 64, 1]cpu" = torch.ops.aten.bmm.default(view_33, view_34); view_33 = view_34 = None
view_35: "bf16[12, 9, 64, 1, 1, 64][36864, 4096, 64, 64, 64, 1]cpu" = torch.ops.aten.view.default(bmm_6, [12, 9, 64, 1, 1, 64]); bmm_6 = None
permute_10: "bf16[1, 12, 9, 64, 64, 1][64, 36864, 4096, 64, 1, 64]cpu" = torch.ops.aten.permute.default(view_35, [4, 0, 1, 2, 5, 3]); view_35 = None
view_36: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.ops.aten.view.default(permute_10, [1, 12, 9, 64, 64]); permute_10 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product = first_band_product * rsqrt_d
mul_9: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.mul.Tensor(view_36, 0.125); view_36 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqd,bhkd->bhlqk", middle_query_matrix, blocked_key_matrix[:, :, -1]
slice_63: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807)
slice_64: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_63, 1, 0, 9223372036854775807); slice_63 = None
select_16: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" =, 2, -1); slice_64 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product = torch.einsum(
unsqueeze_9: "bf16[1, 12, 9, 64, 64, 1][638976, 64, 49152, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_57, 5); slice_57 = None
permute_11: "bf16[1, 12, 9, 64, 1, 64][638976, 64, 49152, 768, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_9, [0, 1, 2, 3, 5, 4]); unsqueeze_9 = None
unsqueeze_10: "bf16[1, 12, 64, 64, 1][638976, 64, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(select_16, 4); select_16 = None
unsqueeze_11: "bf16[1, 12, 64, 64, 1, 1][638976, 64, 768, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_10, 5); unsqueeze_10 = None
permute_12: "bf16[1, 12, 1, 1, 64, 64][638976, 64, 1, 1, 768, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_11, [0, 1, 4, 5, 2, 3]); unsqueeze_11 = None
permute_13: "bf16[12, 9, 64, 64, 1, 1][64, 49152, 768, 1, 638976, 1]cpu" = torch.ops.aten.permute.default(permute_11, [1, 2, 3, 5, 0, 4]); permute_11 = None
view_37: "bf16[12, 576, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(permute_13, [12, 576, 64]); permute_13 = None
permute_14: "bf16[12, 64, 1, 64, 1, 1][64, 1, 638976, 768, 1, 1]cpu" = torch.ops.aten.permute.default(permute_12, [1, 5, 0, 4, 2, 3]); permute_12 = None
view_38: "bf16[12, 64, 64][64, 1, 768]cpu" = torch.ops.aten.view.default(permute_14, [12, 64, 64]); permute_14 = None
bmm_7: "bf16[12, 576, 64][36864, 64, 1]cpu" = torch.ops.aten.bmm.default(view_37, view_38); view_37 = view_38 = None
view_39: "bf16[12, 9, 64, 1, 1, 64][36864, 4096, 64, 64, 64, 1]cpu" = torch.ops.aten.view.default(bmm_7, [12, 9, 64, 1, 1, 64]); bmm_7 = None
permute_15: "bf16[1, 12, 9, 64, 64, 1][64, 36864, 4096, 64, 1, 64]cpu" = torch.ops.aten.permute.default(view_39, [4, 0, 1, 2, 5, 3]); view_39 = None
view_40: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.ops.aten.view.default(permute_15, [1, 12, 9, 64, 64]); permute_15 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product = last_band_product * rsqrt_d
mul_10: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.mul.Tensor(view_40, 0.125); view_40 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: inner_band_product += (1.0 - band_mask) * attn_mask_penalty
sub_4: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, arg17_1); arg17_1 = None
mul_11: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(sub_4, -10000.0); sub_4 = None
add_4: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.add.Tensor(mul_7, mul_11); mul_7 = mul_11 = None
convert_element_type_23: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.prims.convert_element_type.default(add_4, torch.bfloat16); add_4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product += (1.0 - to_mask[:, :, :, :to_block_size].unsqueeze(3)) * attn_mask_penalty
slice_65: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 0, 0, 9223372036854775807)
slice_66: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_65, 1, 0, 9223372036854775807); slice_65 = None
slice_67: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_66, 2, 0, 9223372036854775807); slice_66 = None
slice_68: "f32[1, 1, 1, 64][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_67, 3, 0, 64); slice_67 = None
unsqueeze_12: "f32[1, 1, 1, 1, 64][832, 832, 832, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_68, 3); slice_68 = None
sub_5: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, unsqueeze_12); unsqueeze_12 = None
mul_12: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = torch.ops.aten.mul.Tensor(sub_5, -10000.0); sub_5 = None
add_5: "f32[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(mul_9, mul_12); mul_9 = mul_12 = None
convert_element_type_24: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.prims.convert_element_type.default(add_5, torch.bfloat16); add_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product += (1.0 - to_mask[:, :, :, -to_block_size:].unsqueeze(3)) * attn_mask_penalty
slice_69: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 0, 0, 9223372036854775807)
slice_70: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_69, 1, 0, 9223372036854775807); slice_69 = None
slice_71: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_70, 2, 0, 9223372036854775807); slice_70 = None
slice_72: "f32[1, 1, 1, 64][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_71, 3, -64, 9223372036854775807); slice_71 = None
unsqueeze_13: "f32[1, 1, 1, 1, 64][832, 832, 832, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_72, 3); slice_72 = None
sub_6: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, unsqueeze_13); unsqueeze_13 = None
mul_13: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = torch.ops.aten.mul.Tensor(sub_6, -10000.0); sub_6 = None
add_6: "f32[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(mul_10, mul_13); mul_10 = mul_13 = None
convert_element_type_25: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.prims.convert_element_type.default(add_6, torch.bfloat16); add_6 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product += (1.0 - rand_mask[:, :, 1:-1]) * attn_mask_penalty
slice_73: "f32[1, 12, 11, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.ops.aten.slice.Tensor(mul, 0, 0, 9223372036854775807)
slice_74: "f32[1, 12, 11, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.ops.aten.slice.Tensor(slice_73, 1, 0, 9223372036854775807); slice_73 = None
slice_75: "f32[1, 12, 9, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.ops.aten.slice.Tensor(slice_74, 2, 1, -1); slice_74 = None
sub_7: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, slice_75); slice_75 = None
mul_14: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(sub_7, -10000.0); sub_7 = None
add_7: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.add.Tensor(mul_8, mul_14); mul_8 = mul_14 = None
convert_element_type_26: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.prims.convert_element_type.default(add_7, torch.bfloat16); add_7 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: band_product =
cat_7: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" =[convert_element_type_24, convert_element_type_23, convert_element_type_26, convert_element_type_25], -1); convert_element_type_24 = convert_element_type_23 = convert_element_type_26 = convert_element_type_25 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights = nn.functional.softmax(
convert_element_type_27: "f32[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.prims.convert_element_type.default(cat_7, torch.float32); cat_7 = None
amax_2: "f32[1, 12, 9, 64, 1][6912, 576, 64, 1, 1]cpu" = torch.ops.aten.amax.default(convert_element_type_27, [-1], True)
sub_8: "f32[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.sub.Tensor(convert_element_type_27, amax_2); convert_element_type_27 = amax_2 = None
exp_2: "f32[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.exp.default(sub_8); sub_8 = None
sum_3: "f32[1, 12, 9, 64, 1][6912, 576, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp_2, [-1], True)
div_4: "f32[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.div.Tensor(exp_2, sum_3); exp_2 = sum_3 = None
convert_element_type_28: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.prims.convert_element_type.default(div_4, torch.bfloat16); div_4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights[:, :, :, :, to_block_size : 4 * to_block_size], exp_blocked_value_matrix, ndim=5
slice_76: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(convert_element_type_28, 0, 0, 9223372036854775807)
slice_77: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_76, 1, 0, 9223372036854775807); slice_76 = None
slice_78: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_77, 2, 0, 9223372036854775807); slice_77 = None
slice_79: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_78, 3, 0, 9223372036854775807); slice_78 = None
slice_80: "bf16[1, 12, 9, 64, 192][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_79, 4, 64, 256); slice_79 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
view_41: "bf16[108, 64, 192][32768, 512, 1]cpu" = torch.ops.aten.view.default(slice_80, [108, 64, 192]); slice_80 = None
view_42: "bf16[108, 192, 64][12288, 64, 1]cpu" = torch.ops.aten.view.default(cat_6, [-1, 192, 64]); cat_6 = None
bmm_8: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_41, view_42); view_41 = view_42 = None
view_43: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.view.default(bmm_8, [1, 12, 9, 64, 64]); bmm_8 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights[:, :, :, :, 4 * to_block_size : -to_block_size], gathered_value[:, :, 1:-1], ndim=5
slice_81: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(convert_element_type_28, 0, 0, 9223372036854775807)
slice_82: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_81, 1, 0, 9223372036854775807); slice_81 = None
slice_83: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_82, 2, 0, 9223372036854775807); slice_82 = None
slice_84: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_83, 3, 0, 9223372036854775807); slice_83 = None
slice_85: "bf16[1, 12, 9, 64, 192][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_84, 4, 256, -64); slice_84 = None
slice_86: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view_14, 0, 0, 9223372036854775807)
slice_87: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_86, 1, 0, 9223372036854775807); slice_86 = None
slice_88: "bf16[1, 12, 9, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_87, 2, 1, -1); slice_87 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
view_44: "bf16[108, 64, 192][32768, 512, 1]cpu" = torch.ops.aten.view.default(slice_85, [108, 64, 192]); slice_85 = None
clone_7: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" = torch.ops.aten.clone.default(slice_88, memory_format = torch.contiguous_format); slice_88 = None
view_45: "bf16[108, 192, 64][12288, 64, 1]cpu" = torch.ops.aten.view.default(clone_7, [108, 192, 64]); clone_7 = None
bmm_9: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_44, view_45); view_44 = view_45 = None
view_46: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.view.default(bmm_9, [1, 12, 9, 64, 64]); bmm_9 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += self.torch_bmm_nd(
add_8: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(view_43, view_46); view_43 = view_46 = None
view_47: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.view.default(add_8, [108, 64, 64]); add_8 = None
view_48: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.view.default(view_47, [1, 12, 9, 64, 64]); view_47 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, :to_block_size], blocked_value_matrix[:, :, 0]
slice_89: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(convert_element_type_28, 0, 0, 9223372036854775807)
slice_90: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_89, 1, 0, 9223372036854775807); slice_89 = None
slice_91: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_90, 2, 0, 9223372036854775807); slice_90 = None
slice_92: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_91, 3, 0, 9223372036854775807); slice_91 = None
slice_93: "bf16[1, 12, 9, 64, 64][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_92, 4, 0, 64); slice_92 = None
slice_94: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807)
slice_95: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_94, 1, 0, 9223372036854775807); slice_94 = None
select_17: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" =, 2, 0); slice_95 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += torch.einsum(
unsqueeze_14: "bf16[1, 12, 9, 64, 64, 1][3538944, 294912, 32768, 512, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_93, 5); slice_93 = None
permute_16: "bf16[1, 12, 9, 64, 1, 64][3538944, 294912, 32768, 512, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_14, [0, 1, 2, 3, 5, 4]); unsqueeze_14 = None
unsqueeze_15: "bf16[1, 12, 64, 64, 1][638976, 64, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(select_17, 4); select_17 = None
unsqueeze_16: "bf16[1, 12, 64, 64, 1, 1][638976, 64, 768, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_15, 5); unsqueeze_15 = None
permute_17: "bf16[1, 12, 1, 1, 64, 64][638976, 64, 1, 1, 1, 768]cpu" = torch.ops.aten.permute.default(unsqueeze_16, [0, 1, 4, 5, 3, 2]); unsqueeze_16 = None
permute_18: "bf16[12, 9, 64, 64, 1, 1][294912, 32768, 512, 1, 3538944, 1]cpu" = torch.ops.aten.permute.default(permute_16, [1, 2, 3, 5, 0, 4]); permute_16 = None
view_49: "bf16[12, 576, 64][294912, 512, 1]cpu" = torch.ops.aten.view.default(permute_18, [12, 576, 64]); permute_18 = None
permute_19: "bf16[12, 64, 1, 64, 1, 1][64, 768, 638976, 1, 1, 1]cpu" = torch.ops.aten.permute.default(permute_17, [1, 5, 0, 4, 2, 3]); permute_17 = None
view_50: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(permute_19, [12, 64, 64]); permute_19 = None
bmm_10: "bf16[12, 576, 64][36864, 64, 1]cpu" = torch.ops.aten.bmm.default(view_49, view_50); view_49 = view_50 = None
view_51: "bf16[12, 9, 64, 1, 1, 64][36864, 4096, 64, 64, 64, 1]cpu" = torch.ops.aten.view.default(bmm_10, [12, 9, 64, 1, 1, 64]); bmm_10 = None
permute_20: "bf16[1, 12, 9, 64, 64, 1][64, 36864, 4096, 64, 1, 64]cpu" = torch.ops.aten.permute.default(view_51, [4, 0, 1, 2, 5, 3]); view_51 = None
view_52: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.ops.aten.view.default(permute_20, [1, 12, 9, 64, 64]); permute_20 = None
add_9: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(view_48, view_52); view_48 = view_52 = None
view_53: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.view.default(add_9, [108, 64, 64]); add_9 = None
view_54: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.view.default(view_53, [1, 12, 9, 64, 64]); view_53 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, -to_block_size:], blocked_value_matrix[:, :, -1]
slice_96: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(convert_element_type_28, 0, 0, 9223372036854775807); convert_element_type_28 = None
slice_97: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_96, 1, 0, 9223372036854775807); slice_96 = None
slice_98: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_97, 2, 0, 9223372036854775807); slice_97 = None
slice_99: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_98, 3, 0, 9223372036854775807); slice_98 = None
slice_100: "bf16[1, 12, 9, 64, 64][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_99, 4, -64, 9223372036854775807); slice_99 = None
slice_101: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807)
slice_102: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_101, 1, 0, 9223372036854775807); slice_101 = None
select_18: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" =, 2, -1); slice_102 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += torch.einsum(
unsqueeze_17: "bf16[1, 12, 9, 64, 64, 1][3538944, 294912, 32768, 512, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_100, 5); slice_100 = None
permute_21: "bf16[1, 12, 9, 64, 1, 64][3538944, 294912, 32768, 512, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_17, [0, 1, 2, 3, 5, 4]); unsqueeze_17 = None
unsqueeze_18: "bf16[1, 12, 64, 64, 1][638976, 64, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(select_18, 4); select_18 = None
unsqueeze_19: "bf16[1, 12, 64, 64, 1, 1][638976, 64, 768, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_18, 5); unsqueeze_18 = None
permute_22: "bf16[1, 12, 1, 1, 64, 64][638976, 64, 1, 1, 1, 768]cpu" = torch.ops.aten.permute.default(unsqueeze_19, [0, 1, 4, 5, 3, 2]); unsqueeze_19 = None
permute_23: "bf16[12, 9, 64, 64, 1, 1][294912, 32768, 512, 1, 3538944, 1]cpu" = torch.ops.aten.permute.default(permute_21, [1, 2, 3, 5, 0, 4]); permute_21 = None
view_55: "bf16[12, 576, 64][294912, 512, 1]cpu" = torch.ops.aten.view.default(permute_23, [12, 576, 64]); permute_23 = None
permute_24: "bf16[12, 64, 1, 64, 1, 1][64, 768, 638976, 1, 1, 1]cpu" = torch.ops.aten.permute.default(permute_22, [1, 5, 0, 4, 2, 3]); permute_22 = None
view_56: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(permute_24, [12, 64, 64]); permute_24 = None
bmm_11: "bf16[12, 576, 64][36864, 64, 1]cpu" = torch.ops.aten.bmm.default(view_55, view_56); view_55 = view_56 = None
view_57: "bf16[12, 9, 64, 1, 1, 64][36864, 4096, 64, 64, 64, 1]cpu" = torch.ops.aten.view.default(bmm_11, [12, 9, 64, 1, 1, 64]); bmm_11 = None
permute_25: "bf16[1, 12, 9, 64, 64, 1][64, 36864, 4096, 64, 1, 64]cpu" = torch.ops.aten.permute.default(view_57, [4, 0, 1, 2, 5, 3]); view_57 = None
view_58: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.ops.aten.view.default(permute_25, [1, 12, 9, 64, 64]); permute_25 = None
add_10: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(view_54, view_58); view_54 = view_58 = None
view_59: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.view.default(add_10, [108, 64, 64]); add_10 = None
view_60: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.view.default(view_59, [1, 12, 9, 64, 64]); view_59 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 0],
slice_103: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807)
slice_104: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_103, 1, 0, 9223372036854775807); slice_103 = None
select_19: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" =, 2, 0); slice_104 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -3],
slice_105: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807)
slice_106: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_105, 1, 0, 9223372036854775807); slice_105 = None
select_20: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" =, 2, -3); slice_106 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -2],
slice_107: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807)
slice_108: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_107, 1, 0, 9223372036854775807); slice_107 = None
select_21: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" =, 2, -2); slice_108 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -1],
slice_109: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807); view_5 = None
slice_110: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_109, 1, 0, 9223372036854775807); slice_109 = None
select_22: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" =, 2, -1); slice_110 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key[:, :, -1],
slice_111: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view_10, 0, 0, 9223372036854775807); view_10 = None
slice_112: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_111, 1, 0, 9223372036854775807); slice_111 = None
select_23: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" =, 2, -1); slice_112 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_key_mat =
cat_8: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" =[select_19, select_20, select_21, select_22, select_23], 2); select_19 = select_20 = select_21 = select_22 = select_23 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 0],
slice_113: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807)
slice_114: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_113, 1, 0, 9223372036854775807); slice_113 = None
select_24: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" =, 2, 0); slice_114 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -3],
slice_115: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807)
slice_116: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_115, 1, 0, 9223372036854775807); slice_115 = None
select_25: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" =, 2, -3); slice_116 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -2],
slice_117: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807)
slice_118: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_117, 1, 0, 9223372036854775807); slice_117 = None
select_26: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" =, 2, -2); slice_118 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -1],
slice_119: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807); view_6 = None
slice_120: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_119, 1, 0, 9223372036854775807); slice_119 = None
select_27: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" =, 2, -1); slice_120 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value[:, :, -1],
slice_121: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view_14, 0, 0, 9223372036854775807); view_14 = None
slice_122: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_121, 1, 0, 9223372036854775807); slice_121 = None
select_28: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" =, 2, -1); slice_122 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_value_mat =
cat_9: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" =[select_24, select_25, select_26, select_27, select_28], 2); select_24 = select_25 = select_26 = select_27 = select_28 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -2], second_last_key_mat, ndim=4)
slice_123: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_4, 0, 0, 9223372036854775807)
slice_124: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_123, 1, 0, 9223372036854775807); slice_123 = None
select_29: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" =, 2, -2); slice_124 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
view_61: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(select_29, [12, 64, 64]); select_29 = None
view_62: "bf16[12, 448, 64][28672, 64, 1]cpu" = torch.ops.aten.view.default(cat_8, [-1, 448, 64]); cat_8 = None
permute_26: "bf16[12, 64, 448][28672, 1, 64]cpu" = torch.ops.aten.permute.default(view_62, [0, 2, 1]); view_62 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: return torch.bmm(
bmm_12: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.ops.aten.bmm.default(view_61, permute_26); view_61 = permute_26 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
view_63: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.view.default(bmm_12, [1, 12, 64, 448]); bmm_12 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, :to_block_size],
slice_125: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 0, 0, 9223372036854775807)
slice_126: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_125, 1, 0, 9223372036854775807); slice_125 = None
slice_127: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_126, 2, 0, 9223372036854775807); slice_126 = None
slice_128: "f32[1, 1, 1, 64][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_127, 3, 0, 64); slice_127 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, -3 * to_block_size :],
slice_129: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 0, 0, 9223372036854775807)
slice_130: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_129, 1, 0, 9223372036854775807); slice_129 = None
slice_131: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_130, 2, 0, 9223372036854775807); slice_130 = None
slice_132: "f32[1, 1, 1, 192][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_131, 3, -192, 9223372036854775807); slice_131 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask.new_ones([bsz, 1, 1, n_rand_blocks * to_block_size]),
full_2: "f32[1, 1, 1, 192][192, 192, 192, 1]cpu" = torch.ops.aten.full.default([1, 1, 1, 192], 1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_seq_pad =
cat_10: "f32[1, 1, 1, 448][448, 448, 448, 1]cpu" =[slice_128, slice_132, full_2], 3); slice_128 = slice_132 = full_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask.new_ones([bsz, n_heads, from_block_size, 4 * to_block_size]),
full_3: "f32[1, 12, 64, 256][196608, 16384, 256, 1]cpu" = torch.ops.aten.full.default([1, 12, 64, 256], 1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask[:, :, -1],
slice_133: "f32[1, 12, 11, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.ops.aten.slice.Tensor(mul, 0, 0, 9223372036854775807); mul = None
slice_134: "f32[1, 12, 11, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.ops.aten.slice.Tensor(slice_133, 1, 0, 9223372036854775807); slice_133 = None
select_30: "f32[1, 12, 64, 192][1622016, 135168, 192, 1]cpu" =, 2, -1); slice_134 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_rand_pad =
cat_11: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" =[full_3, select_30], 3); full_3 = select_30 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product = second_last_product * rsqrt_d
mul_15: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.mul.Tensor(view_63, 0.125); view_63 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product += (1.0 - torch.minimum(second_last_seq_pad, second_last_rand_pad)) * attn_mask_penalty
minimum_1: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.minimum.default(cat_10, cat_11); cat_10 = cat_11 = None
sub_9: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, minimum_1); minimum_1 = None
mul_16: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.mul.Tensor(sub_9, -10000.0); sub_9 = None
add_11: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.add.Tensor(mul_15, mul_16); mul_15 = mul_16 = None
convert_element_type_39: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.prims.convert_element_type.default(add_11, torch.bfloat16); add_11 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_attn_weights = nn.functional.softmax(
convert_element_type_40: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.prims.convert_element_type.default(convert_element_type_39, torch.float32); convert_element_type_39 = None
amax_3: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.amax.default(convert_element_type_40, [-1], True)
sub_10: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.sub.Tensor(convert_element_type_40, amax_3); convert_element_type_40 = amax_3 = None
exp_3: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.exp.default(sub_10); sub_10 = None
sum_4: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp_3, [-1], True)
div_5: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.div.Tensor(exp_3, sum_4); exp_3 = sum_4 = None
convert_element_type_41: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.prims.convert_element_type.default(div_5, torch.bfloat16); div_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
view_64: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.ops.aten.view.default(convert_element_type_41, [-1, 64, 448]); convert_element_type_41 = None
view_65: "bf16[12, 448, 64][28672, 64, 1]cpu" = torch.ops.aten.view.default(cat_9, [-1, 448, 64]); cat_9 = None
bmm_13: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_64, view_65); view_64 = view_65 = None
view_66: "bf16[1, 12, 64, 64][49152, 4096, 64, 1]cpu" = torch.ops.aten.view.default(bmm_13, [1, 12, 64, 64]); bmm_13 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_context_layer.unsqueeze_(2)
unsqueeze_20: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(view_66, 2); view_66 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -1], key_layer, ndim=4)
slice_135: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_4, 0, 0, 9223372036854775807); view_4 = None
slice_136: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_135, 1, 0, 9223372036854775807); slice_135 = None
select_31: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" =, 2, -1); slice_136 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
view_67: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(select_31, [12, 64, 64]); select_31 = None
view_68: "bf16[12, 832, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(arg14_1, [12, 832, 64]); arg14_1 = None
permute_27: "bf16[12, 64, 832][64, 1, 768]cpu" = torch.ops.aten.permute.default(view_68, [0, 2, 1]); view_68 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: return torch.bmm(
bmm_14: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.ops.aten.bmm.default(view_67, permute_27); view_67 = permute_27 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
view_69: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.view.default(bmm_14, [1, 12, 64, 832]); bmm_14 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product = last_product * rsqrt_d
mul_17: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.mul.Tensor(view_69, 0.125); view_69 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product += (1.0 - to_mask) * attn_mask_penalty
sub_11: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, arg16_1); arg16_1 = None
mul_18: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.mul.Tensor(sub_11, -10000.0); sub_11 = None
add_12: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.add.Tensor(mul_17, mul_18); mul_17 = mul_18 = None
convert_element_type_46: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.prims.convert_element_type.default(add_12, torch.bfloat16); add_12 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_attn_weights = nn.functional.softmax(last_product, dim=-1) # [bsz, n_heads, from_block_size, n]
convert_element_type_47: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.prims.convert_element_type.default(convert_element_type_46, torch.float32); convert_element_type_46 = None
amax_4: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.amax.default(convert_element_type_47, [-1], True)
sub_12: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.sub.Tensor(convert_element_type_47, amax_4); convert_element_type_47 = amax_4 = None
exp_4: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.exp.default(sub_12); sub_12 = None
sum_5: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp_4, [-1], True)
div_6: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.div.Tensor(exp_4, sum_5); exp_4 = sum_5 = None
convert_element_type_48: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.prims.convert_element_type.default(div_6, torch.bfloat16); div_6 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
view_70: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.ops.aten.view.default(convert_element_type_48, [-1, 64, 832]); convert_element_type_48 = None
view_71: "bf16[12, 832, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(arg15_1, [12, 832, 64]); arg15_1 = None
bmm_15: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_70, view_71); view_70 = view_71 = None
view_72: "bf16[1, 12, 64, 64][49152, 4096, 64, 1]cpu" = torch.ops.aten.view.default(bmm_15, [1, 12, 64, 64]); bmm_15 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_context_layer.unsqueeze_(2)
unsqueeze_21: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(view_72, 2); view_72 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer =
cat_12: "bf16[1, 12, 13, 64, 64][638976, 53248, 4096, 64, 1]cpu" =[unsqueeze_4, unsqueeze_5, view_60, unsqueeze_20, unsqueeze_21], 2); unsqueeze_4 = unsqueeze_5 = view_60 = unsqueeze_20 = unsqueeze_21 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer = context_layer.view((bsz, n_heads, from_seq_len, -1)) * from_mask
view_73: "bf16[1, 12, 832, 64][638976, 53248, 64, 1]cpu" = torch.ops.aten.view.default(cat_12, [1, 12, 832, -1]); cat_12 = None
mul_19: "f32[1, 12, 832, 64][638976, 53248, 64, 1]cpu" = torch.ops.aten.mul.Tensor(view_73, arg18_1); view_73 = arg18_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer = torch.transpose(context_layer, 1, 2)
permute_28: "f32[1, 832, 12, 64][638976, 64, 53248, 1]cpu" = torch.ops.aten.permute.default(mul_19, [0, 2, 1, 3]); mul_19 = None
return (permute_28, clone)
V0627 17:31:05.639000 139845268738432 torch/_inductor/] {"inductor_post_grad_graph": {}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0, "has_payload": "ee4f5da4b7396f62d53589c7ddc358c5"}
class <lambda>(torch.nn.Module):
def forward(self, arg0_1: "i32[11, 3][3, 1]cpu", arg1_1: "i32[11, 3][3, 1]cpu", arg2_1: "i32[11, 3][3, 1]cpu", arg3_1: "i32[11, 3][3, 1]cpu", arg4_1: "i32[11, 3][3, 1]cpu", arg5_1: "i32[11, 3][3, 1]cpu", arg6_1: "i32[11, 3][3, 1]cpu", arg7_1: "i32[11, 3][3, 1]cpu", arg8_1: "i32[11, 3][3, 1]cpu", arg9_1: "i32[11, 3][3, 1]cpu", arg10_1: "i32[11, 3][3, 1]cpu", arg11_1: "i32[11, 3][3, 1]cpu", arg12_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", arg13_1: "f32[1, 13, 64][832, 64, 1]cpu", arg14_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", arg15_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", arg16_1: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu", arg17_1: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu", arg18_1: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu"):
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_query_matrix = query_layer.view(bsz, n_heads, from_seq_len // from_block_size, from_block_size, -1)
view_4: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.reshape.default(arg12_1, [1, 12, 13, 64, -1]); arg12_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, 0], key_layer, ndim=4)
select_2: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" =, 2, 0)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
view_15: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(select_2, [12, 64, 64]); select_2 = None
view_16: "bf16[12, 832, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(arg14_1, [12, 832, 64])
permute_2: "bf16[12, 64, 832][64, 1, 768]cpu" = torch.ops.aten.permute.default(view_16, [0, 2, 1]); view_16 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: return torch.bmm(
bmm: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.ops.aten.bmm.default(view_15, permute_2); view_15 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
view_17: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.reshape.default(bmm, [1, 12, 64, 832]); bmm = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product = first_product * rsqrt_d
mul_3: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.mul.Tensor(view_17, 0.125); view_17 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product += (1.0 - to_mask) * attn_mask_penalty
sub: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, arg16_1)
mul_4: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.mul.Tensor(sub, -10000.0); sub = None
add_2: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.add.Tensor(mul_3, mul_4); mul_3 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_attn_weights = nn.functional.softmax(
amax: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.amax.default(add_2, [-1], True)
sub_1: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.sub.Tensor(add_2, amax); add_2 = amax = None
exp: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.exp.default(sub_1); sub_1 = None
sum_1: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp, [-1], True)
div_2: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.div.Tensor(exp, sum_1); exp = sum_1 = None
convert_element_type_5: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.prims.convert_element_type.default(div_2, torch.bfloat16); div_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
view_18: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.ops.aten.reshape.default(convert_element_type_5, [-1, 64, 832]); convert_element_type_5 = None
view_19: "bf16[12, 832, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(arg15_1, [12, 832, 64])
bmm_1: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_18, view_19); view_18 = None
view_20: "bf16[1, 12, 64, 64][49152, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_1, [1, 12, 64, 64]); bmm_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_context_layer.unsqueeze_(2)
unsqueeze_4: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(view_20, 2); view_20 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, 1], second_key_mat, ndim=4)
select_13: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" =, 2, 1)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
view_21: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(select_13, [12, 64, 64]); select_13 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix = key_layer.view(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1)
view_5: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.reshape.default(arg14_1, [1, 12, 13, 64, -1]); arg14_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 0],
select_3: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" =, 2, 0)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 1],
select_4: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" =, 2, 1)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 2],
select_5: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" =, 2, 2)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -1],
select_6: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" =, 2, -1)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_gather_b2, code: flattened_params = params.reshape(-1, params.shape[-2], params.shape[-1])
clone_2: "bf16[1, 12, 13, 64, 64][638976, 53248, 4096, 64, 1]cpu" = torch.ops.aten.clone.default(view_5, memory_format = torch.contiguous_format)
view_8: "bf16[156, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.reshape.default(clone_2, [156, 64, 64]); clone_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = np.stack(rand_attn, axis=0)
cat: "i32[132, 3][3, 1]cpu" =[arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1]); arg0_1 = arg1_1 = arg2_1 = arg3_1 = arg4_1 = arg5_1 = arg6_1 = arg7_1 = arg8_1 = arg9_1 = arg10_1 = arg11_1 = None
view: "i32[12, 11, 3][33, 3, 1]cpu" = torch.ops.aten.reshape.default(cat, [12, 11, 3]); cat = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = torch.tensor(rand_attn, device=query_layer.device, dtype=torch.long)
convert_element_type: "i64[12, 11, 3][33, 3, 1]cpu" = torch.ops.prims.convert_element_type.default(view, torch.int64); view = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn.unsqueeze_(0)
unsqueeze: "i64[1, 12, 11, 3][396, 33, 3, 1]cpu" = torch.ops.aten.unsqueeze.default(convert_element_type, 0); convert_element_type = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_gather_b2, code: flattened_indices = indices.view(-1) + indices_shift
view_7: "i64[396][1]cpu" = torch.ops.aten.reshape.default(unsqueeze, [-1])
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_gather_b2, code: shift = torch.arange(indices.shape[0] * indices.shape[1] * num_indices_to_gather, device=indices.device)
iota: "i64[396][1]cpu" = torch.ops.prims.iota.default(396, start = 0, step = 1, dtype = torch.int64, device = device(type='cpu'), requires_grad = False)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_gather_b2, code: indices_shift = torch.div(shift, num_indices_to_gather, rounding_mode="floor") * num_indices_to_pick_from
div: "i64[396][1]cpu" = torch.ops.aten.div.Tensor_mode(iota, 33, rounding_mode = 'floor'); iota = None
mul_1: "i64[396][1]cpu" = torch.ops.aten.mul.Tensor(div, 13); div = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_gather_b2, code: flattened_indices = indices.view(-1) + indices_shift
add: "i64[396][1]cpu" = torch.ops.aten.add.Tensor(view_7, mul_1); view_7 = mul_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_gather_b2, code: out_flattened = flattened_params.index_select(0, flattened_indices)
index_1: "bf16[396, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.index.Tensor(view_8, [add]); view_8 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_gather_b2, code: out = out_flattened.reshape(params.shape[:2] + (num_indices_to_gather,) + params.shape[3:])
view_9: "bf16[1, 12, 33, 64, 64][1622016, 135168, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(index_1, [1, 12, 33, 64, 64]); index_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key = gathered_key.view(
view_10: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.reshape.default(view_9, [1, 12, 11, 192, -1]); view_9 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key[:, :, 0],
select_7: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" =, 2, 0)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_key_mat =
cat_1: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" =[select_3, select_4, select_5, select_6, select_7], 2); select_4 = select_5 = select_7 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
view_22: "bf16[12, 448, 64][28672, 64, 1]cpu" = torch.ops.aten.reshape.default(cat_1, [-1, 448, 64]); cat_1 = None
permute_3: "bf16[12, 64, 448][28672, 1, 64]cpu" = torch.ops.aten.permute.default(view_22, [0, 2, 1]); view_22 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: return torch.bmm(
bmm_2: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.ops.aten.bmm.default(view_21, permute_3); view_21 = permute_3 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
view_23: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.reshape.default(bmm_2, [1, 12, 64, 448]); bmm_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product = second_product * rsqrt_d
mul_5: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.mul.Tensor(view_23, 0.125); view_23 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, : 3 * to_block_size],
slice_30: "f32[1, 1, 1, 192][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 3, 0, 192)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, -to_block_size:],
slice_34: "f32[1, 1, 1, 64][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 3, -64, 9223372036854775807)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask.new_ones([bsz, 1, 1, n_rand_blocks * to_block_size]),
full_default: "f32[1, 1, 1, 192][192, 192, 192, 1]cpu" = torch.ops.aten.full.default([1, 1, 1, 192], 1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_seq_pad =
cat_3: "f32[1, 1, 1, 448][448, 448, 448, 1]cpu" =[slice_30, slice_34, full_default], 3); slice_30 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask.new_ones([bsz, n_heads, from_block_size, 4 * to_block_size]),
full_default_1: "f32[1, 12, 64, 256][196608, 16384, 256, 1]cpu" = torch.ops.aten.full.default([1, 12, 64, 256], 1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in _create_rand_mask_from_inputs, code: rand_mask = torch.einsum("blq,bhlk->bhlqk", from_blocked_mask[:, 1:-1], rand_mask)
slice_2: "f32[1, 11, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(arg13_1, 1, 1, -1)
unsqueeze_1: "f32[1, 11, 64, 1][832, 64, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_2, 3); slice_2 = None
unsqueeze_2: "f32[1, 11, 64, 1, 1][832, 64, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_1, 4); unsqueeze_1 = None
permute: "f32[1, 1, 11, 64, 1][832, 1, 64, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_2, [0, 3, 1, 2, 4]); unsqueeze_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in _create_rand_mask_from_inputs, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
select: "f32[13, 64][64, 1]cpu" =, 0, 0); arg13_1 = None
select_1: "i64[12, 11, 3][33, 3, 1]cpu" =, 0, 0)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in <listcomp>, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
view_1: "i64[396][1]cpu" = torch.ops.aten.reshape.default(select_1, [396]); select_1 = None
index: "f32[396, 64][64, 1]cpu" = torch.ops.aten.index.Tensor(select, [view_1]); select = view_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in _create_rand_mask_from_inputs, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
view_2: "f32[1, 396, 64][25344, 64, 1]cpu" = torch.ops.aten.reshape.default(index, [1, 396, 64]); index = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in _create_rand_mask_from_inputs, code: rand_mask = rand_mask.view(batch_size, num_attention_heads, num_windows, num_rand_blocks * from_block_size)
view_3: "f32[1, 12, 11, 192][25344, 2112, 192, 1]cpu" = torch.ops.aten.reshape.default(view_2, [1, 12, 11, 192]); view_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in _create_rand_mask_from_inputs, code: rand_mask = torch.einsum("blq,bhlk->bhlqk", from_blocked_mask[:, 1:-1], rand_mask)
unsqueeze_3: "f32[1, 12, 11, 192, 1][25344, 2112, 192, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(view_3, 4); view_3 = None
permute_1: "f32[1, 12, 11, 1, 192][25344, 2112, 192, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_3, [0, 1, 2, 4, 3]); unsqueeze_3 = None
mul: "f32[1, 12, 11, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(permute, permute_1); permute = permute_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask[:, :, 0],
select_14: "f32[1, 12, 64, 192][1622016, 135168, 192, 1]cpu" =, 2, 0)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_rand_pad =
cat_4: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" =[full_default_1, select_14], 3); select_14 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product += (1.0 - torch.minimum(second_seq_pad, second_rand_pad)) * attn_mask_penalty
minimum: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.minimum.default(cat_3, cat_4); cat_3 = cat_4 = None
sub_2: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, minimum); minimum = None
mul_6: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.mul.Tensor(sub_2, -10000.0); sub_2 = None
add_3: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.add.Tensor(mul_5, mul_6); mul_5 = mul_6 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_attn_weights = nn.functional.softmax(
amax_1: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.amax.default(add_3, [-1], True)
sub_3: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.sub.Tensor(add_3, amax_1); add_3 = amax_1 = None
exp_1: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.exp.default(sub_3); sub_3 = None
sum_2: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp_1, [-1], True)
div_3: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.div.Tensor(exp_1, sum_2); exp_1 = sum_2 = None
convert_element_type_12: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.prims.convert_element_type.default(div_3, torch.bfloat16); div_3 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
view_24: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.ops.aten.reshape.default(convert_element_type_12, [-1, 64, 448]); convert_element_type_12 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix = value_layer.view(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1)
view_6: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.reshape.default(arg15_1, [1, 12, 13, 64, -1]); arg15_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 0],
select_8: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" =, 2, 0)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 1],
select_9: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" =, 2, 1)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 2],
select_10: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" =, 2, 2)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -1],
select_11: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" =, 2, -1)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_gather_b2, code: flattened_params = params.reshape(-1, params.shape[-2], params.shape[-1])
clone_3: "bf16[1, 12, 13, 64, 64][638976, 53248, 4096, 64, 1]cpu" = torch.ops.aten.clone.default(view_6, memory_format = torch.contiguous_format)
view_12: "bf16[156, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.reshape.default(clone_3, [156, 64, 64]); clone_3 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_gather_b2, code: out_flattened = flattened_params.index_select(0, flattened_indices)
index_2: "bf16[396, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.index.Tensor(view_12, [add]); view_12 = add = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_gather_b2, code: out = out_flattened.reshape(params.shape[:2] + (num_indices_to_gather,) + params.shape[3:])
view_13: "bf16[1, 12, 33, 64, 64][1622016, 135168, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(index_2, [1, 12, 33, 64, 64]); index_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value = gathered_value.view(
view_14: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.reshape.default(view_13, [1, 12, 11, 192, -1]); view_13 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value[:, :, 0],
select_12: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" =, 2, 0)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_value_mat =
cat_2: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" =[select_8, select_9, select_10, select_11, select_12], 2); select_9 = select_10 = select_12 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
view_25: "bf16[12, 448, 64][28672, 64, 1]cpu" = torch.ops.aten.reshape.default(cat_2, [-1, 448, 64]); cat_2 = None
bmm_3: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_24, view_25); view_24 = view_25 = None
view_26: "bf16[1, 12, 64, 64][49152, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_3, [1, 12, 64, 64]); bmm_3 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_context_layer.unsqueeze_(2)
unsqueeze_5: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(view_26, 2); view_26 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: middle_query_matrix = blocked_query_matrix[:, :, 2:-2]
slice_57: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_4, 2, 2, -2)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product = torch.einsum(
unsqueeze_6: "bf16[1, 12, 9, 64, 64, 1][638976, 64, 49152, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_57, 5)
permute_6: "bf16[1, 12, 9, 64, 1, 64][638976, 64, 49152, 768, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_6, [0, 1, 2, 3, 5, 4]); unsqueeze_6 = None
permute_8: "bf16[12, 9, 64, 64, 1, 1][64, 49152, 768, 1, 638976, 1]cpu" = torch.ops.aten.permute.default(permute_6, [1, 2, 3, 5, 0, 4]); permute_6 = None
view_33: "bf16[12, 576, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(permute_8, [12, 576, 64]); permute_8 = None
unsqueeze_7: "bf16[1, 12, 64, 64, 1][638976, 64, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(select_3, 4)
unsqueeze_8: "bf16[1, 12, 64, 64, 1, 1][638976, 64, 768, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_7, 5); unsqueeze_7 = None
permute_7: "bf16[1, 12, 1, 1, 64, 64][638976, 64, 1, 1, 768, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_8, [0, 1, 4, 5, 2, 3]); unsqueeze_8 = None
permute_9: "bf16[12, 64, 1, 64, 1, 1][64, 1, 638976, 768, 1, 1]cpu" = torch.ops.aten.permute.default(permute_7, [1, 5, 0, 4, 2, 3]); permute_7 = None
view_34: "bf16[12, 64, 64][64, 1, 768]cpu" = torch.ops.aten.reshape.default(permute_9, [12, 64, 64]); permute_9 = None
bmm_6: "bf16[12, 576, 64][36864, 64, 1]cpu" = torch.ops.aten.bmm.default(view_33, view_34); view_34 = None
view_35: "bf16[12, 9, 64, 1, 1, 64][36864, 4096, 64, 64, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_6, [12, 9, 64, 1, 1, 64]); bmm_6 = None
permute_10: "bf16[1, 12, 9, 64, 64, 1][64, 36864, 4096, 64, 1, 64]cpu" = torch.ops.aten.permute.default(view_35, [4, 0, 1, 2, 5, 3]); view_35 = None
view_36: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(permute_10, [1, 12, 9, 64, 64]); permute_10 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product = first_band_product * rsqrt_d
mul_9: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.mul.Tensor(view_36, 0.125); view_36 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product += (1.0 - to_mask[:, :, :, :to_block_size].unsqueeze(3)) * attn_mask_penalty
slice_68: "f32[1, 1, 1, 64][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 3, 0, 64)
unsqueeze_12: "f32[1, 1, 1, 1, 64][832, 832, 832, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_68, 3)
sub_5: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, unsqueeze_12); unsqueeze_12 = None
mul_12: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = torch.ops.aten.mul.Tensor(sub_5, -10000.0); sub_5 = None
add_5: "f32[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(mul_9, mul_12); mul_9 = mul_12 = None
convert_element_type_24: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.prims.convert_element_type.default(add_5, torch.bfloat16); add_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
clone_4: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.clone.default(slice_57, memory_format = torch.contiguous_format); slice_57 = None
view_27: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.reshape.default(clone_4, [108, 64, 64]); clone_4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: [blocked_key_matrix[:, :, 1:-3], blocked_key_matrix[:, :, 2:-2], blocked_key_matrix[:, :, 3:-1]], dim=3
slice_39: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 2, 1, -3)
slice_42: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 2, 2, -2)
slice_45: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 2, 3, -1)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: exp_blocked_key_matrix =
cat_5: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" =[slice_39, slice_42, slice_45], 3); slice_39 = slice_42 = slice_45 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
view_28: "bf16[108, 192, 64][12288, 64, 1]cpu" = torch.ops.aten.reshape.default(cat_5, [-1, 192, 64]); cat_5 = None
permute_4: "bf16[108, 64, 192][12288, 1, 64]cpu" = torch.ops.aten.permute.default(view_28, [0, 2, 1]); view_28 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: return torch.bmm(
bmm_4: "bf16[108, 64, 192][12288, 192, 1]cpu" = torch.ops.aten.bmm.default(view_27, permute_4); permute_4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
view_29: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.reshape.default(bmm_4, [1, 12, 9, 64, 192]); bmm_4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: inner_band_product = inner_band_product * rsqrt_d
mul_7: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(view_29, 0.125); view_29 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: inner_band_product += (1.0 - band_mask) * attn_mask_penalty
sub_4: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, arg17_1); arg17_1 = None
mul_11: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(sub_4, -10000.0); sub_4 = None
add_4: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.add.Tensor(mul_7, mul_11); mul_7 = mul_11 = None
convert_element_type_23: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.prims.convert_element_type.default(add_4, torch.bfloat16); add_4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product = self.torch_bmm_nd_transpose(middle_query_matrix, gathered_key[:, :, 1:-1], ndim=5)
slice_60: "bf16[1, 12, 9, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view_10, 2, 1, -1)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
clone_6: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" = torch.ops.aten.clone.default(slice_60, memory_format = torch.contiguous_format); slice_60 = None
view_31: "bf16[108, 192, 64][12288, 64, 1]cpu" = torch.ops.aten.reshape.default(clone_6, [108, 192, 64]); clone_6 = None
permute_5: "bf16[108, 64, 192][12288, 1, 64]cpu" = torch.ops.aten.permute.default(view_31, [0, 2, 1]); view_31 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: return torch.bmm(
bmm_5: "bf16[108, 64, 192][12288, 192, 1]cpu" = torch.ops.aten.bmm.default(view_27, permute_5); view_27 = permute_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
view_32: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.reshape.default(bmm_5, [1, 12, 9, 64, 192]); bmm_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product = rand_band_product * rsqrt_d
mul_8: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(view_32, 0.125); view_32 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product += (1.0 - rand_mask[:, :, 1:-1]) * attn_mask_penalty
slice_75: "f32[1, 12, 9, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.ops.aten.slice.Tensor(mul, 2, 1, -1)
sub_7: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, slice_75); slice_75 = None
mul_14: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(sub_7, -10000.0); sub_7 = None
add_7: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.add.Tensor(mul_8, mul_14); mul_8 = mul_14 = None
convert_element_type_26: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.prims.convert_element_type.default(add_7, torch.bfloat16); add_7 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product = torch.einsum(
unsqueeze_10: "bf16[1, 12, 64, 64, 1][638976, 64, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(select_6, 4)
unsqueeze_11: "bf16[1, 12, 64, 64, 1, 1][638976, 64, 768, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_10, 5); unsqueeze_10 = None
permute_12: "bf16[1, 12, 1, 1, 64, 64][638976, 64, 1, 1, 768, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_11, [0, 1, 4, 5, 2, 3]); unsqueeze_11 = None
permute_14: "bf16[12, 64, 1, 64, 1, 1][64, 1, 638976, 768, 1, 1]cpu" = torch.ops.aten.permute.default(permute_12, [1, 5, 0, 4, 2, 3]); permute_12 = None
view_38: "bf16[12, 64, 64][64, 1, 768]cpu" = torch.ops.aten.reshape.default(permute_14, [12, 64, 64]); permute_14 = None
bmm_7: "bf16[12, 576, 64][36864, 64, 1]cpu" = torch.ops.aten.bmm.default(view_33, view_38); view_33 = view_38 = None
view_39: "bf16[12, 9, 64, 1, 1, 64][36864, 4096, 64, 64, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_7, [12, 9, 64, 1, 1, 64]); bmm_7 = None
permute_15: "bf16[1, 12, 9, 64, 64, 1][64, 36864, 4096, 64, 1, 64]cpu" = torch.ops.aten.permute.default(view_39, [4, 0, 1, 2, 5, 3]); view_39 = None
view_40: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(permute_15, [1, 12, 9, 64, 64]); permute_15 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product = last_band_product * rsqrt_d
mul_10: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.mul.Tensor(view_40, 0.125); view_40 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product += (1.0 - to_mask[:, :, :, -to_block_size:].unsqueeze(3)) * attn_mask_penalty
unsqueeze_13: "f32[1, 1, 1, 1, 64][832, 832, 832, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_34, 3); slice_34 = None
sub_6: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, unsqueeze_13); unsqueeze_13 = None
mul_13: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = torch.ops.aten.mul.Tensor(sub_6, -10000.0); sub_6 = None
add_6: "f32[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(mul_10, mul_13); mul_10 = mul_13 = None
convert_element_type_25: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.prims.convert_element_type.default(add_6, torch.bfloat16); add_6 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: band_product =
cat_7: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" =[convert_element_type_24, convert_element_type_23, convert_element_type_26, convert_element_type_25], -1); convert_element_type_24 = convert_element_type_23 = convert_element_type_26 = convert_element_type_25 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights = nn.functional.softmax(
convert_element_type_27: "f32[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.prims.convert_element_type.default(cat_7, torch.float32); cat_7 = None
amax_2: "f32[1, 12, 9, 64, 1][6912, 576, 64, 1, 1]cpu" = torch.ops.aten.amax.default(convert_element_type_27, [-1], True)
sub_8: "f32[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.sub.Tensor(convert_element_type_27, amax_2); convert_element_type_27 = amax_2 = None
exp_2: "f32[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.exp.default(sub_8); sub_8 = None
sum_3: "f32[1, 12, 9, 64, 1][6912, 576, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp_2, [-1], True)
div_4: "f32[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.div.Tensor(exp_2, sum_3); exp_2 = sum_3 = None
convert_element_type_28: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.prims.convert_element_type.default(div_4, torch.bfloat16); div_4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights[:, :, :, :, to_block_size : 4 * to_block_size], exp_blocked_value_matrix, ndim=5
slice_80: "bf16[1, 12, 9, 64, 192][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(convert_element_type_28, 4, 64, 256)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
view_41: "bf16[108, 64, 192][32768, 512, 1]cpu" = torch.ops.aten.reshape.default(slice_80, [108, 64, 192]); slice_80 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: [blocked_value_matrix[:, :, 1:-3], blocked_value_matrix[:, :, 2:-2], blocked_value_matrix[:, :, 3:-1]],
slice_48: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 2, 1, -3)
slice_51: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 2, 2, -2)
slice_54: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 2, 3, -1)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: exp_blocked_value_matrix =
cat_6: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" =[slice_48, slice_51, slice_54], 3); slice_48 = slice_51 = slice_54 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
view_42: "bf16[108, 192, 64][12288, 64, 1]cpu" = torch.ops.aten.reshape.default(cat_6, [-1, 192, 64]); cat_6 = None
bmm_8: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_41, view_42); view_41 = view_42 = None
view_43: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_8, [1, 12, 9, 64, 64]); bmm_8 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights[:, :, :, :, 4 * to_block_size : -to_block_size], gathered_value[:, :, 1:-1], ndim=5
slice_85: "bf16[1, 12, 9, 64, 192][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(convert_element_type_28, 4, 256, -64)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
view_44: "bf16[108, 64, 192][32768, 512, 1]cpu" = torch.ops.aten.reshape.default(slice_85, [108, 64, 192]); slice_85 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights[:, :, :, :, 4 * to_block_size : -to_block_size], gathered_value[:, :, 1:-1], ndim=5
slice_88: "bf16[1, 12, 9, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view_14, 2, 1, -1)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
clone_7: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" = torch.ops.aten.clone.default(slice_88, memory_format = torch.contiguous_format); slice_88 = None
view_45: "bf16[108, 192, 64][12288, 64, 1]cpu" = torch.ops.aten.reshape.default(clone_7, [108, 192, 64]); clone_7 = None
bmm_9: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_44, view_45); view_44 = view_45 = None
view_46: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_9, [1, 12, 9, 64, 64]); bmm_9 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += self.torch_bmm_nd(
add_8: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(view_43, view_46); view_43 = view_46 = None
view_47: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.reshape.default(add_8, [108, 64, 64]); add_8 = None
view_48: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(view_47, [1, 12, 9, 64, 64]); view_47 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, :to_block_size], blocked_value_matrix[:, :, 0]
slice_93: "bf16[1, 12, 9, 64, 64][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(convert_element_type_28, 4, 0, 64)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += torch.einsum(
unsqueeze_14: "bf16[1, 12, 9, 64, 64, 1][3538944, 294912, 32768, 512, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_93, 5); slice_93 = None
permute_16: "bf16[1, 12, 9, 64, 1, 64][3538944, 294912, 32768, 512, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_14, [0, 1, 2, 3, 5, 4]); unsqueeze_14 = None
permute_18: "bf16[12, 9, 64, 64, 1, 1][294912, 32768, 512, 1, 3538944, 1]cpu" = torch.ops.aten.permute.default(permute_16, [1, 2, 3, 5, 0, 4]); permute_16 = None
view_49: "bf16[12, 576, 64][294912, 512, 1]cpu" = torch.ops.aten.reshape.default(permute_18, [12, 576, 64]); permute_18 = None
unsqueeze_15: "bf16[1, 12, 64, 64, 1][638976, 64, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(select_8, 4)
unsqueeze_16: "bf16[1, 12, 64, 64, 1, 1][638976, 64, 768, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_15, 5); unsqueeze_15 = None
permute_17: "bf16[1, 12, 1, 1, 64, 64][638976, 64, 1, 1, 1, 768]cpu" = torch.ops.aten.permute.default(unsqueeze_16, [0, 1, 4, 5, 3, 2]); unsqueeze_16 = None
permute_19: "bf16[12, 64, 1, 64, 1, 1][64, 768, 638976, 1, 1, 1]cpu" = torch.ops.aten.permute.default(permute_17, [1, 5, 0, 4, 2, 3]); permute_17 = None
view_50: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(permute_19, [12, 64, 64]); permute_19 = None
bmm_10: "bf16[12, 576, 64][36864, 64, 1]cpu" = torch.ops.aten.bmm.default(view_49, view_50); view_49 = view_50 = None
view_51: "bf16[12, 9, 64, 1, 1, 64][36864, 4096, 64, 64, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_10, [12, 9, 64, 1, 1, 64]); bmm_10 = None
permute_20: "bf16[1, 12, 9, 64, 64, 1][64, 36864, 4096, 64, 1, 64]cpu" = torch.ops.aten.permute.default(view_51, [4, 0, 1, 2, 5, 3]); view_51 = None
view_52: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(permute_20, [1, 12, 9, 64, 64]); permute_20 = None
add_9: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(view_48, view_52); view_48 = view_52 = None
view_53: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.reshape.default(add_9, [108, 64, 64]); add_9 = None
view_54: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(view_53, [1, 12, 9, 64, 64]); view_53 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, -to_block_size:], blocked_value_matrix[:, :, -1]
slice_100: "bf16[1, 12, 9, 64, 64][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(convert_element_type_28, 4, -64, 9223372036854775807); convert_element_type_28 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += torch.einsum(
unsqueeze_17: "bf16[1, 12, 9, 64, 64, 1][3538944, 294912, 32768, 512, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_100, 5); slice_100 = None
permute_21: "bf16[1, 12, 9, 64, 1, 64][3538944, 294912, 32768, 512, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_17, [0, 1, 2, 3, 5, 4]); unsqueeze_17 = None
permute_23: "bf16[12, 9, 64, 64, 1, 1][294912, 32768, 512, 1, 3538944, 1]cpu" = torch.ops.aten.permute.default(permute_21, [1, 2, 3, 5, 0, 4]); permute_21 = None
view_55: "bf16[12, 576, 64][294912, 512, 1]cpu" = torch.ops.aten.reshape.default(permute_23, [12, 576, 64]); permute_23 = None
unsqueeze_18: "bf16[1, 12, 64, 64, 1][638976, 64, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(select_11, 4)
unsqueeze_19: "bf16[1, 12, 64, 64, 1, 1][638976, 64, 768, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_18, 5); unsqueeze_18 = None
permute_22: "bf16[1, 12, 1, 1, 64, 64][638976, 64, 1, 1, 1, 768]cpu" = torch.ops.aten.permute.default(unsqueeze_19, [0, 1, 4, 5, 3, 2]); unsqueeze_19 = None
permute_24: "bf16[12, 64, 1, 64, 1, 1][64, 768, 638976, 1, 1, 1]cpu" = torch.ops.aten.permute.default(permute_22, [1, 5, 0, 4, 2, 3]); permute_22 = None
view_56: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(permute_24, [12, 64, 64]); permute_24 = None
bmm_11: "bf16[12, 576, 64][36864, 64, 1]cpu" = torch.ops.aten.bmm.default(view_55, view_56); view_55 = view_56 = None
view_57: "bf16[12, 9, 64, 1, 1, 64][36864, 4096, 64, 64, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_11, [12, 9, 64, 1, 1, 64]); bmm_11 = None
permute_25: "bf16[1, 12, 9, 64, 64, 1][64, 36864, 4096, 64, 1, 64]cpu" = torch.ops.aten.permute.default(view_57, [4, 0, 1, 2, 5, 3]); view_57 = None
view_58: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(permute_25, [1, 12, 9, 64, 64]); permute_25 = None
add_10: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(view_54, view_58); view_54 = view_58 = None
view_59: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.reshape.default(add_10, [108, 64, 64]); add_10 = None
view_60: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(view_59, [1, 12, 9, 64, 64]); view_59 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -2], second_last_key_mat, ndim=4)
select_29: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" =, 2, -2)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
view_61: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(select_29, [12, 64, 64]); select_29 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -3],
select_20: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" =, 2, -3)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -2],
select_21: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" =, 2, -2); view_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key[:, :, -1],
select_23: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" =, 2, -1); view_10 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_key_mat =
cat_8: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" =[select_3, select_20, select_21, select_6, select_23], 2); select_3 = select_20 = select_21 = select_6 = select_23 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
view_62: "bf16[12, 448, 64][28672, 64, 1]cpu" = torch.ops.aten.reshape.default(cat_8, [-1, 448, 64]); cat_8 = None
permute_26: "bf16[12, 64, 448][28672, 1, 64]cpu" = torch.ops.aten.permute.default(view_62, [0, 2, 1]); view_62 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: return torch.bmm(
bmm_12: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.ops.aten.bmm.default(view_61, permute_26); view_61 = permute_26 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
view_63: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.reshape.default(bmm_12, [1, 12, 64, 448]); bmm_12 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product = second_last_product * rsqrt_d
mul_15: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.mul.Tensor(view_63, 0.125); view_63 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, -3 * to_block_size :],
slice_132: "f32[1, 1, 1, 192][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 3, -192, 9223372036854775807); arg16_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_seq_pad =
cat_10: "f32[1, 1, 1, 448][448, 448, 448, 1]cpu" =[slice_68, slice_132, full_default], 3); slice_68 = slice_132 = full_default = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask[:, :, -1],
select_30: "f32[1, 12, 64, 192][1622016, 135168, 192, 1]cpu" =, 2, -1); mul = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_rand_pad =
cat_11: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" =[full_default_1, select_30], 3); full_default_1 = select_30 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product += (1.0 - torch.minimum(second_last_seq_pad, second_last_rand_pad)) * attn_mask_penalty
minimum_1: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.minimum.default(cat_10, cat_11); cat_10 = cat_11 = None
sub_9: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, minimum_1); minimum_1 = None
mul_16: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.mul.Tensor(sub_9, -10000.0); sub_9 = None
add_11: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.add.Tensor(mul_15, mul_16); mul_15 = mul_16 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_attn_weights = nn.functional.softmax(
amax_3: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.amax.default(add_11, [-1], True)
sub_10: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.sub.Tensor(add_11, amax_3); add_11 = amax_3 = None
exp_3: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.exp.default(sub_10); sub_10 = None
sum_4: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp_3, [-1], True)
div_5: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.div.Tensor(exp_3, sum_4); exp_3 = sum_4 = None
convert_element_type_41: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.prims.convert_element_type.default(div_5, torch.bfloat16); div_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
view_64: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.ops.aten.reshape.default(convert_element_type_41, [-1, 64, 448]); convert_element_type_41 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -3],
select_25: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" =, 2, -3)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -2],
select_26: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" =, 2, -2); view_6 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value[:, :, -1],
select_28: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" =, 2, -1); view_14 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_value_mat =
cat_9: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" =[select_8, select_25, select_26, select_11, select_28], 2); select_8 = select_25 = select_26 = select_11 = select_28 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
view_65: "bf16[12, 448, 64][28672, 64, 1]cpu" = torch.ops.aten.reshape.default(cat_9, [-1, 448, 64]); cat_9 = None
bmm_13: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_64, view_65); view_64 = view_65 = None
view_66: "bf16[1, 12, 64, 64][49152, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_13, [1, 12, 64, 64]); bmm_13 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_context_layer.unsqueeze_(2)
unsqueeze_20: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(view_66, 2); view_66 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -1], key_layer, ndim=4)
select_31: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" =, 2, -1); view_4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
view_67: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(select_31, [12, 64, 64]); select_31 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: return torch.bmm(
bmm_14: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.ops.aten.bmm.default(view_67, permute_2); view_67 = permute_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
view_69: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.reshape.default(bmm_14, [1, 12, 64, 832]); bmm_14 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product = last_product * rsqrt_d
mul_17: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.mul.Tensor(view_69, 0.125); view_69 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product += (1.0 - to_mask) * attn_mask_penalty
add_12: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.add.Tensor(mul_17, mul_4); mul_17 = mul_4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_attn_weights = nn.functional.softmax(last_product, dim=-1) # [bsz, n_heads, from_block_size, n]
amax_4: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.amax.default(add_12, [-1], True)
sub_12: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.sub.Tensor(add_12, amax_4); add_12 = amax_4 = None
exp_4: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.exp.default(sub_12); sub_12 = None
sum_5: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp_4, [-1], True)
div_6: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.div.Tensor(exp_4, sum_5); exp_4 = sum_5 = None
convert_element_type_48: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.prims.convert_element_type.default(div_6, torch.bfloat16); div_6 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
view_70: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.ops.aten.reshape.default(convert_element_type_48, [-1, 64, 832]); convert_element_type_48 = None
bmm_15: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_70, view_19); view_70 = view_19 = None
view_72: "bf16[1, 12, 64, 64][49152, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_15, [1, 12, 64, 64]); bmm_15 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_context_layer.unsqueeze_(2)
unsqueeze_21: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(view_72, 2); view_72 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer =
cat_12: "bf16[1, 12, 13, 64, 64][638976, 53248, 4096, 64, 1]cpu" =[unsqueeze_4, unsqueeze_5, view_60, unsqueeze_20, unsqueeze_21], 2); unsqueeze_4 = unsqueeze_5 = view_60 = unsqueeze_20 = unsqueeze_21 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer = context_layer.view((bsz, n_heads, from_seq_len, -1)) * from_mask
view_73: "bf16[1, 12, 832, 64][638976, 53248, 64, 1]cpu" = torch.ops.aten.reshape.default(cat_12, [1, 12, 832, -1]); cat_12 = None
mul_19: "f32[1, 12, 832, 64][638976, 53248, 64, 1]cpu" = torch.ops.aten.mul.Tensor(view_73, arg18_1); view_73 = arg18_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer = torch.transpose(context_layer, 1, 2)
permute_28: "f32[1, 832, 12, 64][638976, 64, 53248, 1]cpu" = torch.ops.aten.permute.default(mul_19, [0, 2, 1, 3]); mul_19 = None
return (permute_28, unsqueeze)
V0627 17:31:09.541000 139845268738432 torch/_inductor/] {"inductor_output_code": {"filename": "/tmp/torchinductor_leslie/6i/"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0, "has_payload": "47217ba55691917867319806954aafb8"}
# AOT ID: ['8_inference']
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
from torch._inductor.codegen.memory_planning import _align as align
from torch import device, empty_strided
from torch._inductor.async_compile import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
from torch._inductor.codegen.multi_kernel import MultiKernelCall
aten = torch.ops.aten
inductor_ops = torch.ops.inductor
_quantized = torch.ops._quantized
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
alloc_from_pool = torch.ops.inductor._alloc_from_pool
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
async_compile = AsyncCompile()
cpp_fused__softmax_add_mul_rsub_0 = async_compile.cpp_pybinding(['const bfloat16*', 'const float*', 'float*', 'float*', 'float*', 'bfloat16*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const bfloat16* in_ptr0,
const float* in_ptr1,
float* out_ptr0,
float* out_ptr1,
float* out_ptr2,
bfloat16* out_ptr3)
#pragma omp parallel num_threads(56)
int tid = omp_get_thread_num();
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(768L); x0+=static_cast<long>(1L))
float tmp_acc0 = -std::numeric_limits<float>::infinity();
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(-std::numeric_limits<float>::infinity());
for(long x1=static_cast<long>(0L); x1<static_cast<long>(832L); x1+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x1 + (832L*x0)), 16);
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(x1), 16);
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = static_cast<float>(0.125);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 * tmp3;
auto tmp5 = (tmp4);
auto tmp7 = static_cast<float>(1.0);
auto tmp8 = at::vec::Vectorized<float>(tmp7);
auto tmp9 = tmp8 - tmp6;
auto tmp10 = static_cast<float>(-10000.0);
auto tmp11 = at::vec::Vectorized<float>(tmp10);
auto tmp12 = tmp9 * tmp11;
auto tmp13 = tmp5 + tmp12;
tmp_acc0_vec = at::vec::maximum(tmp_acc0_vec, tmp13);
tmp_acc0 = max_propagate_nan(tmp_acc0, at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return at::vec::maximum(x, y); }, tmp_acc0_vec));
out_ptr0[static_cast<long>(x0)] = static_cast<float>(tmp_acc0);
float tmp_acc0 = 0;
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
for(long x1=static_cast<long>(0L); x1<static_cast<long>(832L); x1+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x1 + (832L*x0)), 16);
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(x1), 16);
auto tmp14 = out_ptr0[static_cast<long>(x0)];
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = static_cast<float>(0.125);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 * tmp3;
auto tmp5 = (tmp4);
auto tmp7 = static_cast<float>(1.0);
auto tmp8 = at::vec::Vectorized<float>(tmp7);
auto tmp9 = tmp8 - tmp6;
auto tmp10 = static_cast<float>(-10000.0);
auto tmp11 = at::vec::Vectorized<float>(tmp10);
auto tmp12 = tmp9 * tmp11;
auto tmp13 = tmp5 + tmp12;
auto tmp15 = at::vec::Vectorized<float>(tmp14);
auto tmp16 = tmp13 - tmp15;
auto tmp17 = tmp16.exp(); + static_cast<long>(x1 + (832L*x0)));
tmp_acc0_vec = tmp_acc0_vec + tmp17;
tmp_acc0 = tmp_acc0 + at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return x + y; }, tmp_acc0_vec);
out_ptr2[static_cast<long>(x0)] = static_cast<float>(tmp_acc0);
for(long x1=static_cast<long>(0L); x1<static_cast<long>(832L); x1+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr1 + static_cast<long>(x1 + (832L*x0)), 16);
auto tmp1 = out_ptr2[static_cast<long>(x0)];
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 / tmp2;
auto tmp4 = at::vec::convert<bfloat16>(tmp3); + static_cast<long>(x1 + (832L*x0)), 16);
cpp_fused__to_copy_cat_stack_1 = async_compile.cpp_pybinding(['const int32_t*', 'const int32_t*', 'const int32_t*', 'const int32_t*', 'const int32_t*', 'const int32_t*', 'const int32_t*', 'const int32_t*', 'const int32_t*', 'const int32_t*', 'const int32_t*', 'const int32_t*', 'const int32_t*', 'const bfloat16*', 'int32_t*', 'int32_t*', 'int32_t*', 'int32_t*', 'int32_t*', 'int32_t*', 'int32_t*', 'int32_t*', 'int32_t*', 'int32_t*', 'int32_t*', 'int32_t*', 'int64_t*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const int32_t* in_ptr0,
const int32_t* in_ptr1,
const int32_t* in_ptr2,
const int32_t* in_ptr3,
const int32_t* in_ptr4,
const int32_t* in_ptr5,
const int32_t* in_ptr6,
const int32_t* in_ptr7,
const int32_t* in_ptr8,
const int32_t* in_ptr9,
const int32_t* in_ptr10,
const int32_t* in_ptr11,
const int32_t* in_ptr12,
const bfloat16* in_ptr13,
int32_t* out_ptr0,
int32_t* out_ptr1,
int32_t* out_ptr2,
int32_t* out_ptr3,
int32_t* out_ptr4,
int32_t* out_ptr5,
int32_t* out_ptr6,
int32_t* out_ptr7,
int32_t* out_ptr8,
int32_t* out_ptr9,
int32_t* out_ptr10,
int32_t* out_ptr11,
int64_t* out_ptr12,
bfloat16* out_ptr13,
bfloat16* out_ptr14,
bfloat16* out_ptr15,
bfloat16* out_ptr16,
bfloat16* out_ptr17,
bfloat16* out_ptr18,
bfloat16* out_ptr19,
bfloat16* out_ptr20)
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr0 + static_cast<long>(x0), 16); + static_cast<long>(x0), 16);
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L))
auto tmp0 = in_ptr0[static_cast<long>(x0)];
out_ptr0[static_cast<long>(x0)] = tmp0;
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr1 + static_cast<long>(x0), 16); + static_cast<long>(x0), 16);
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L))
auto tmp0 = in_ptr1[static_cast<long>(x0)];
out_ptr1[static_cast<long>(x0)] = tmp0;
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr2 + static_cast<long>(x0), 16); + static_cast<long>(x0), 16);
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L))
auto tmp0 = in_ptr2[static_cast<long>(x0)];
out_ptr2[static_cast<long>(x0)] = tmp0;
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr3 + static_cast<long>(x0), 16); + static_cast<long>(x0), 16);
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L))
auto tmp0 = in_ptr3[static_cast<long>(x0)];
out_ptr3[static_cast<long>(x0)] = tmp0;
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr4 + static_cast<long>(x0), 16); + static_cast<long>(x0), 16);
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L))
auto tmp0 = in_ptr4[static_cast<long>(x0)];
out_ptr4[static_cast<long>(x0)] = tmp0;
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr5 + static_cast<long>(x0), 16); + static_cast<long>(x0), 16);
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L))
auto tmp0 = in_ptr5[static_cast<long>(x0)];
out_ptr5[static_cast<long>(x0)] = tmp0;
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr6 + static_cast<long>(x0), 16); + static_cast<long>(x0), 16);
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L))
auto tmp0 = in_ptr6[static_cast<long>(x0)];
out_ptr6[static_cast<long>(x0)] = tmp0;
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr7 + static_cast<long>(x0), 16); + static_cast<long>(x0), 16);
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L))
auto tmp0 = in_ptr7[static_cast<long>(x0)];
out_ptr7[static_cast<long>(x0)] = tmp0;
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr8 + static_cast<long>(x0), 16); + static_cast<long>(x0), 16);
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L))
auto tmp0 = in_ptr8[static_cast<long>(x0)];
out_ptr8[static_cast<long>(x0)] = tmp0;
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr9 + static_cast<long>(x0), 16); + static_cast<long>(x0), 16);
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L))
auto tmp0 = in_ptr9[static_cast<long>(x0)];
out_ptr9[static_cast<long>(x0)] = tmp0;
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr10 + static_cast<long>(x0), 16); + static_cast<long>(x0), 16);
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L))
auto tmp0 = in_ptr10[static_cast<long>(x0)];
out_ptr10[static_cast<long>(x0)] = tmp0;
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr11 + static_cast<long>(x0), 16); + static_cast<long>(x0), 16);
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L))
auto tmp0 = in_ptr11[static_cast<long>(x0)];
out_ptr11[static_cast<long>(x0)] = tmp0;
for(long x0=static_cast<long>(0L); x0<static_cast<long>(384L); x0+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr12 + static_cast<long>(x0), 16);
auto tmp1 = at::vec::convert<int64_t,2,int32_t,1>(tmp0); + static_cast<long>(x0), 16);
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(384L); x0<static_cast<long>(396L); x0+=static_cast<long>(1L))
auto tmp0 = in_ptr12[static_cast<long>(x0)];
auto tmp1 = c10::convert<int64_t>(tmp0);
out_ptr12[static_cast<long>(x0)] = tmp1;
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L))
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L))
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr13 + static_cast<long>(x2 + (64L*x0) + (768L*x1)), 32); + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32); + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32);
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L))
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L))
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr13 + static_cast<long>(49152L + x2 + (64L*x0) + (768L*x1)), 32); + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32);
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L))
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L))
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr13 + static_cast<long>(98304L + x2 + (64L*x0) + (768L*x1)), 32); + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32);
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L))
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L))
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr13 + static_cast<long>(589824L + x2 + (64L*x0) + (768L*x1)), 32); + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32); + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32);
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(192L); x1+=static_cast<long>(1L))
#pragma GCC ivdep
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(1L))
auto tmp0 = out_ptr12[static_cast<long>((33L*(c10::div_floor_integer((x2 + (64L*x1) + (135168L*x0)), 135168L))) + (c10::div_floor_integer((x2 + (64L*x1)), 4096L)))];
auto tmp13 = out_ptr12[static_cast<long>((33L*(c10::div_floor_integer((122880L + x2 + (64L*x1) + (135168L*x0)), 135168L))) + (c10::div_floor_integer((122880L + x2 + (64L*x1)), 4096L)))];
auto tmp1 = 13L*(c10::div_floor_integer(((33L*(c10::div_floor_integer((x2 + (64L*x1) + (135168L*x0)), 135168L))) + (c10::div_floor_integer((x2 + (64L*x1)), 4096L))), 33L));
auto tmp2 = c10::convert<int64_t>(tmp1);
auto tmp3 = decltype(tmp0)(tmp0 + tmp2);
auto tmp4 = 156L;
auto tmp5 = c10::convert<int64_t>(tmp4);
auto tmp6 = decltype(tmp3)(tmp3 + tmp5);
auto tmp7 = tmp3 < 0;
auto tmp8 = tmp7 ? tmp6 : tmp3;
auto tmp9 = tmp8;
auto tmp10 = c10::convert<int64_t>(tmp9);
TORCH_CHECK((0 <= tmp10) & (tmp10 < 156L), "index out of bounds: 0 <= tmp10 < 156L");
auto tmp12 = in_ptr13[static_cast<long>(x2 + (64L*(static_cast<long>(c10::div_floor_integer(tmp8, 13L)) % static_cast<long>(12L))) + (768L*(static_cast<long>(x1) % static_cast<long>(64L))) + (49152L*(static_cast<long>(tmp8) % static_cast<long>(13L))))];
auto tmp14 = 13L*(c10::div_floor_integer(((33L*(c10::div_floor_integer((122880L + x2 + (64L*x1) + (135168L*x0)), 135168L))) + (c10::div_floor_integer((122880L + x2 + (64L*x1)), 4096L))), 33L));
auto tmp15 = c10::convert<int64_t>(tmp14);
auto tmp16 = decltype(tmp13)(tmp13 + tmp15);
auto tmp17 = decltype(tmp16)(tmp16 + tmp5);
auto tmp18 = tmp16 < 0;
auto tmp19 = tmp18 ? tmp17 : tmp16;
auto tmp20 = tmp19;
auto tmp21 = c10::convert<int64_t>(tmp20);
TORCH_CHECK((0 <= tmp21) & (tmp21 < 156L), "index out of bounds: 0 <= tmp21 < 156L");
auto tmp23 = in_ptr13[static_cast<long>(x2 + (64L*(static_cast<long>(c10::div_floor_integer(tmp19, 13L)) % static_cast<long>(12L))) + (768L*(static_cast<long>(x1) % static_cast<long>(64L))) + (49152L*(static_cast<long>(tmp19) % static_cast<long>(13L))))];
out_ptr19[static_cast<long>(x2 + (64L*x1) + (28672L*x0))] = tmp12;
out_ptr20[static_cast<long>(x2 + (64L*x1) + (28672L*x0))] = tmp23;
cpp_fused__softmax_add_cat_minimum_mul_new_ones_rsub_2 = async_compile.cpp_pybinding(['const float*', 'const float*', 'const int64_t*', 'const bfloat16*', 'const float*', 'const float*', 'const bfloat16*', 'float*', 'float*', 'float*', 'float*', 'float*', 'float*', 'float*', 'float*', 'float*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const float* in_ptr0,
const float* in_ptr1,
const int64_t* in_ptr2,
const bfloat16* in_ptr3,
const float* in_ptr4,
const float* in_ptr5,
const bfloat16* in_ptr6,
float* out_ptr0,
float* out_ptr1,
float* out_ptr2,
float* out_ptr3,
float* out_ptr4,
float* out_ptr5,
float* out_ptr6,
float* out_ptr7,
float* out_ptr8,
bfloat16* out_ptr9,
bfloat16* out_ptr10,
bfloat16* out_ptr11,
bfloat16* out_ptr12,
bfloat16* out_ptr13,
bfloat16* out_ptr14,
bfloat16* out_ptr15,
bfloat16* out_ptr16,
bfloat16* out_ptr17)
for(long x0=static_cast<long>(0L); x0<static_cast<long>(192L); x0+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<long>(x0), 16); + static_cast<long>(x0));
for(long x0=static_cast<long>(0L); x0<static_cast<long>(64L); x0+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<long>(768L + x0), 16); + static_cast<long>(x0));
for(long x0=static_cast<long>(0L); x0<static_cast<long>(192L); x0+=static_cast<long>(16L))
auto tmp0 = static_cast<float>(1.0);
auto tmp1 = at::vec::Vectorized<float>(tmp0); + static_cast<long>(x0));
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(768L); x0+=static_cast<long>(1L))
for(long x1=static_cast<long>(0L); x1<static_cast<long>(256L); x1+=static_cast<long>(16L))
auto tmp0 = static_cast<float>(1.0);
auto tmp1 = at::vec::Vectorized<float>(tmp0); + static_cast<long>(x1 + (448L*x0)));
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L))
for(long x2=static_cast<long>(0L); x2<static_cast<long>(192L); x2+=static_cast<long>(16L))
auto tmp0 = in_ptr1[static_cast<long>(64L + x1)];
auto tmp1 =
__at_align__ std::array<int64_t, 16> tmpbuf;
#pragma GCC unroll 16
for (long x2_inner = 0; x2_inner < 16; x2_inner++)
tmpbuf[x2_inner] = in_ptr2[static_cast<long>(c10::div_floor_integer((x2 + x2_inner + (2112L*x0)), 64L))];
return at::vec::VectorizedN<int64_t,2>::loadu(, 16);
auto tmp16 = in_ptr1[static_cast<long>(704L + x1)];
auto tmp17 =
__at_align__ std::array<int64_t, 16> tmpbuf;
#pragma GCC unroll 16
for (long x2_inner = 0; x2_inner < 16; x2_inner++)
tmpbuf[x2_inner] = in_ptr2[static_cast<long>(c10::div_floor_integer((1920L + x2 + x2_inner + (2112L*x0)), 64L))];
return at::vec::VectorizedN<int64_t,2>::loadu(, 16);
auto tmp2 = 13L;
auto tmp3 = c10::convert<int64_t>(tmp2);
auto tmp4 = at::vec::VectorizedN<int64_t,2>(tmp3);
auto tmp5 = tmp1 + tmp4;
auto tmp6 = static_cast<int64_t>(0);
auto tmp7 = at::vec::VectorizedN<int64_t,2>(tmp6);
auto tmp8 = at::vec::VecMask<int64_t,2>(tmp1 < tmp7);
auto tmp9 = decltype(tmp5)::blendv(tmp1, tmp5, tmp8.template cast<int64_t,2>());
auto tmp10 =
__at_align__ std::array<int64_t, 16> tmpbuf;;
return tmpbuf;
auto tmp11 =
__at_align__ std::array<int64_t, 16> tmpbuf;
#pragma GCC unroll 16
for (long x2_inner = 0; x2_inner < 16; x2_inner++)
tmpbuf[x2_inner] = static_cast<long>(tmp10[x2_inner]);
return at::vec::VectorizedN<int64_t,2>::loadu(, 16);
TORCH_CHECK((at::vec::VecMask<int64_t,2>((at::vec::VectorizedN<int64_t,2>(0) <= tmp11) & (tmp11 < at::vec::VectorizedN<int64_t,2>(13L)))).all_masked(), "index out of bounds: 0 <= tmp11 < 13L");
auto tmp13 =
__at_align__ std::array<float, 16> tmpbuf;
#pragma GCC unroll 16
for (long x2_inner = 0; x2_inner < 16; x2_inner++)
tmpbuf[x2_inner] = in_ptr1[static_cast<long>((64L*tmp10[x2_inner]) + (static_cast<long>((x2 + x2_inner)) % static_cast<long>(64L)))];
return at::vec::Vectorized<float>::loadu(, 16);
auto tmp14 = at::vec::Vectorized<float>(tmp0);
auto tmp15 = tmp14 * tmp13;
auto tmp18 = tmp17 + tmp4;
auto tmp19 = at::vec::VecMask<int64_t,2>(tmp17 < tmp7);
auto tmp20 = decltype(tmp18)::blendv(tmp17, tmp18, tmp19.template cast<int64_t,2>());
auto tmp21 =
__at_align__ std::array<int64_t, 16> tmpbuf;;
return tmpbuf;
auto tmp22 =
__at_align__ std::array<int64_t, 16> tmpbuf;
#pragma GCC unroll 16
for (long x2_inner = 0; x2_inner < 16; x2_inner++)
tmpbuf[x2_inner] = static_cast<long>(tmp21[x2_inner]);
return at::vec::VectorizedN<int64_t,2>::loadu(, 16);
TORCH_CHECK((at::vec::VecMask<int64_t,2>((at::vec::VectorizedN<int64_t,2>(0) <= tmp22) & (tmp22 < at::vec::VectorizedN<int64_t,2>(13L)))).all_masked(), "index out of bounds: 0 <= tmp22 < 13L");
auto tmp24 =
__at_align__ std::array<float, 16> tmpbuf;
#pragma GCC unroll 16
for (long x2_inner = 0; x2_inner < 16; x2_inner++)
tmpbuf[x2_inner] = in_ptr1[static_cast<long>((64L*tmp21[x2_inner]) + (static_cast<long>((x2 + x2_inner)) % static_cast<long>(64L)))];
return at::vec::Vectorized<float>::loadu(, 16);
auto tmp25 = at::vec::Vectorized<float>(tmp16);
auto tmp26 = tmp25 * tmp24; + static_cast<long>(x2 + (448L*x1) + (28672L*x0))); + static_cast<long>(x2 + (448L*x1) + (28672L*x0)));
#pragma omp parallel num_threads(56)
int tid = omp_get_thread_num();
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(768L); x0+=static_cast<long>(1L))
float tmp_acc0 = -std::numeric_limits<float>::infinity();
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(-std::numeric_limits<float>::infinity());
for(long x1=static_cast<long>(0L); x1<static_cast<long>(448L); x1+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr3 + static_cast<long>(x1 + (448L*x0)), 16);
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr4 + static_cast<long>(x1), 16);
auto tmp7 = at::vec::Vectorized<float>::loadu(in_ptr5 + static_cast<long>(x1 + (448L*x0)), 16);
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = static_cast<float>(0.125);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 * tmp3;
auto tmp5 = (tmp4);
auto tmp8 = at::vec::minimum(tmp6, tmp7);
auto tmp9 = static_cast<float>(1.0);
auto tmp10 = at::vec::Vectorized<float>(tmp9);
auto tmp11 = tmp10 - tmp8;
auto tmp12 = static_cast<float>(-10000.0);
auto tmp13 = at::vec::Vectorized<float>(tmp12);
auto tmp14 = tmp11 * tmp13;
auto tmp15 = tmp5 + tmp14;
tmp_acc0_vec = at::vec::maximum(tmp_acc0_vec, tmp15);
tmp_acc0 = max_propagate_nan(tmp_acc0, at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return at::vec::maximum(x, y); }, tmp_acc0_vec));
out_ptr6[static_cast<long>(x0)] = static_cast<float>(tmp_acc0);
float tmp_acc0 = 0;
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
for(long x1=static_cast<long>(0L); x1<static_cast<long>(448L); x1+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr3 + static_cast<long>(x1 + (448L*x0)), 16);
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr4 + static_cast<long>(x1), 16);
auto tmp7 = at::vec::Vectorized<float>::loadu(in_ptr5 + static_cast<long>(x1 + (448L*x0)), 16);
auto tmp16 = out_ptr6[static_cast<long>(x0)];
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = static_cast<float>(0.125);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 * tmp3;
auto tmp5 = (tmp4);
auto tmp8 = at::vec::minimum(tmp6, tmp7);
auto tmp9 = static_cast<float>(1.0);
auto tmp10 = at::vec::Vectorized<float>(tmp9);
auto tmp11 = tmp10 - tmp8;
auto tmp12 = static_cast<float>(-10000.0);
auto tmp13 = at::vec::Vectorized<float>(tmp12);
auto tmp14 = tmp11 * tmp13;
auto tmp15 = tmp5 + tmp14;
auto tmp17 = at::vec::Vectorized<float>(tmp16);
auto tmp18 = tmp15 - tmp17;
auto tmp19 = tmp18.exp(); + static_cast<long>(x1 + (448L*x0)));
tmp_acc0_vec = tmp_acc0_vec + tmp19;
tmp_acc0 = tmp_acc0 + at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return x + y; }, tmp_acc0_vec);
out_ptr8[static_cast<long>(x0)] = static_cast<float>(tmp_acc0);
for(long x1=static_cast<long>(0L); x1<static_cast<long>(448L); x1+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr7 + static_cast<long>(x1 + (448L*x0)), 16);
auto tmp1 = out_ptr8[static_cast<long>(x0)];
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 / tmp2;
auto tmp4 = at::vec::convert<bfloat16>(tmp3); + static_cast<long>(x1 + (448L*x0)), 16);
#pragma omp single
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L))
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L))
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr6 + static_cast<long>(x2 + (64L*x0) + (768L*x1)), 32); + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32); + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32);
#pragma omp single
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L))
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L))
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr6 + static_cast<long>(49152L + x2 + (64L*x0) + (768L*x1)), 32); + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32);
#pragma omp single
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L))
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L))
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr6 + static_cast<long>(98304L + x2 + (64L*x0) + (768L*x1)), 32); + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32);
#pragma omp single
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L))
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L))
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr6 + static_cast<long>(589824L + x2 + (64L*x0) + (768L*x1)), 32); + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32); + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32);
#pragma omp single
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(192L); x1+=static_cast<long>(1L))
#pragma GCC ivdep
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(1L))
auto tmp0 = in_ptr2[static_cast<long>((33L*(c10::div_floor_integer((x2 + (64L*x1) + (135168L*x0)), 135168L))) + (c10::div_floor_integer((x2 + (64L*x1)), 4096L)))];
auto tmp13 = in_ptr2[static_cast<long>((33L*(c10::div_floor_integer((122880L + x2 + (64L*x1) + (135168L*x0)), 135168L))) + (c10::div_floor_integer((122880L + x2 + (64L*x1)), 4096L)))];
auto tmp1 = 13L*(c10::div_floor_integer(((33L*(c10::div_floor_integer((x2 + (64L*x1) + (135168L*x0)), 135168L))) + (c10::div_floor_integer((x2 + (64L*x1)), 4096L))), 33L));
auto tmp2 = c10::convert<int64_t>(tmp1);
auto tmp3 = decltype(tmp0)(tmp0 + tmp2);
auto tmp4 = 156L;
auto tmp5 = c10::convert<int64_t>(tmp4);
auto tmp6 = decltype(tmp3)(tmp3 + tmp5);
auto tmp7 = tmp3 < 0;
auto tmp8 = tmp7 ? tmp6 : tmp3;
auto tmp9 = tmp8;
auto tmp10 = c10::convert<int64_t>(tmp9);
TORCH_CHECK((0 <= tmp10) & (tmp10 < 156L), "index out of bounds: 0 <= tmp10 < 156L");
auto tmp12 = in_ptr6[static_cast<long>(x2 + (64L*(static_cast<long>(c10::div_floor_integer(tmp8, 13L)) % static_cast<long>(12L))) + (768L*(static_cast<long>(x1) % static_cast<long>(64L))) + (49152L*(static_cast<long>(tmp8) % static_cast<long>(13L))))];
auto tmp14 = 13L*(c10::div_floor_integer(((33L*(c10::div_floor_integer((122880L + x2 + (64L*x1) + (135168L*x0)), 135168L))) + (c10::div_floor_integer((122880L + x2 + (64L*x1)), 4096L))), 33L));
auto tmp15 = c10::convert<int64_t>(tmp14);
auto tmp16 = decltype(tmp13)(tmp13 + tmp15);
auto tmp17 = decltype(tmp16)(tmp16 + tmp5);
auto tmp18 = tmp16 < 0;
auto tmp19 = tmp18 ? tmp17 : tmp16;
auto tmp20 = tmp19;
auto tmp21 = c10::convert<int64_t>(tmp20);
TORCH_CHECK((0 <= tmp21) & (tmp21 < 156L), "index out of bounds: 0 <= tmp21 < 156L");
auto tmp23 = in_ptr6[static_cast<long>(x2 + (64L*(static_cast<long>(c10::div_floor_integer(tmp19, 13L)) % static_cast<long>(12L))) + (768L*(static_cast<long>(x1) % static_cast<long>(64L))) + (49152L*(static_cast<long>(tmp19) % static_cast<long>(13L))))];
out_ptr16[static_cast<long>(x2 + (64L*x1) + (28672L*x0))] = tmp12;
out_ptr17[static_cast<long>(x2 + (64L*x1) + (28672L*x0))] = tmp23;
cpp_fused_cat_clone_3 = async_compile.cpp_pybinding(['const bfloat16*', 'const bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const bfloat16* in_ptr0,
const bfloat16* in_ptr1,
bfloat16* out_ptr0,
bfloat16* out_ptr1,
bfloat16* out_ptr2,
bfloat16* out_ptr3)
#pragma omp parallel num_threads(56)
int tid = omp_get_thread_num();
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(9L); x1+=static_cast<long>(1L))
#pragma GCC ivdep
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(1L))
for(long x3=static_cast<long>(0L); x3<static_cast<long>(64L); x3+=static_cast<long>(32L))
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(49152L + x3 + (64L*x0) + (768L*x2) + (49152L*x1)), 32); + static_cast<long>(x3 + (64L*x2) + (12288L*x1) + (110592L*x0)), 32);
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(9L); x1+=static_cast<long>(1L))
#pragma GCC ivdep
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(1L))
for(long x3=static_cast<long>(0L); x3<static_cast<long>(64L); x3+=static_cast<long>(32L))
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(98304L + x3 + (64L*x0) + (768L*x2) + (49152L*x1)), 32); + static_cast<long>(x3 + (64L*x2) + (12288L*x1) + (110592L*x0)), 32);
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(9L); x1+=static_cast<long>(1L))
#pragma GCC ivdep
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(1L))
for(long x3=static_cast<long>(0L); x3<static_cast<long>(64L); x3+=static_cast<long>(32L))
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(147456L + x3 + (64L*x0) + (768L*x2) + (49152L*x1)), 32); + static_cast<long>(x3 + (64L*x2) + (12288L*x1) + (110592L*x0)), 32);
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(576L); x1+=static_cast<long>(1L))
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L))
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr1 + static_cast<long>(98304L + x2 + (64L*x0) + (768L*x1)), 32); + static_cast<long>(x2 + (64L*x1) + (36864L*x0)), 32);
cpp_fused_clone_4 = async_compile.cpp_pybinding(['const int64_t*', 'const bfloat16*', 'const bfloat16*', 'bfloat16*', 'bfloat16*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const int64_t* in_ptr0,
const bfloat16* in_ptr1,
const bfloat16* in_ptr2,
bfloat16* out_ptr0,
bfloat16* out_ptr1)
#pragma omp parallel num_threads(56)
int tid = omp_get_thread_num();
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(9L); x1+=static_cast<long>(1L))
#pragma GCC ivdep
for(long x2=static_cast<long>(0L); x2<static_cast<long>(192L); x2+=static_cast<long>(1L))
#pragma GCC ivdep
for(long x3=static_cast<long>(0L); x3<static_cast<long>(64L); x3+=static_cast<long>(1L))
auto tmp0 = in_ptr0[static_cast<long>((33L*(c10::div_floor_integer((12288L + x3 + (64L*x2) + (12288L*x1) + (135168L*x0)), 135168L))) + (c10::div_floor_integer((12288L + x3 + (64L*x2) + (12288L*x1)), 4096L)))];
auto tmp1 = 13L*(c10::div_floor_integer(((33L*(c10::div_floor_integer((12288L + x3 + (64L*x2) + (12288L*x1) + (135168L*x0)), 135168L))) + (c10::div_floor_integer((12288L + x3 + (64L*x2) + (12288L*x1)), 4096L))), 33L));
auto tmp2 = c10::convert<int64_t>(tmp1);
auto tmp3 = decltype(tmp0)(tmp0 + tmp2);
auto tmp4 = 156L;
auto tmp5 = c10::convert<int64_t>(tmp4);
auto tmp6 = decltype(tmp3)(tmp3 + tmp5);
auto tmp7 = tmp3 < 0;
auto tmp8 = tmp7 ? tmp6 : tmp3;
auto tmp9 = tmp8;
auto tmp10 = c10::convert<int64_t>(tmp9);
TORCH_CHECK((0 <= tmp10) & (tmp10 < 156L), "index out of bounds: 0 <= tmp10 < 156L");
auto tmp12 = in_ptr1[static_cast<long>(x3 + (64L*(static_cast<long>(c10::div_floor_integer(tmp8, 13L)) % static_cast<long>(12L))) + (768L*(static_cast<long>(x2) % static_cast<long>(64L))) + (49152L*(static_cast<long>(tmp8) % static_cast<long>(13L))))];
auto tmp13 = in_ptr2[static_cast<long>(x3 + (64L*(static_cast<long>(c10::div_floor_integer(tmp8, 13L)) % static_cast<long>(12L))) + (768L*(static_cast<long>(x2) % static_cast<long>(64L))) + (49152L*(static_cast<long>(tmp8) % static_cast<long>(13L))))];
out_ptr0[static_cast<long>(x3 + (64L*x2) + (12288L*x1) + (110592L*x0))] = tmp12;
out_ptr1[static_cast<long>(x3 + (64L*x2) + (12288L*x1) + (110592L*x0))] = tmp13;
cpp_fused__softmax__to_copy_add_cat_mul_rsub_5 = async_compile.cpp_pybinding(['const bfloat16*', 'const float*', 'const bfloat16*', 'const float*', 'const bfloat16*', 'const float*', 'const int64_t*', 'const bfloat16*', 'const bfloat16*', 'const bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'float*', 'float*', 'float*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const bfloat16* in_ptr0,
const float* in_ptr1,
const bfloat16* in_ptr2,
const float* in_ptr3,
const bfloat16* in_ptr4,
const float* in_ptr5,
const int64_t* in_ptr6,
const bfloat16* in_ptr7,
const bfloat16* in_ptr8,
const bfloat16* in_ptr9,
bfloat16* out_ptr0,
bfloat16* out_ptr1,
bfloat16* out_ptr2,
bfloat16* out_ptr3,
float* out_ptr4,
float* out_ptr5,
float* out_ptr6,
bfloat16* out_ptr7,
bfloat16* out_ptr8,
bfloat16* out_ptr9,
bfloat16* out_ptr10)
#pragma omp parallel num_threads(56)
int tid = omp_get_thread_num();
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(6912L); x0+=static_cast<long>(1L))
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x1 + (64L*x0)), 16);
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(x1), 16);
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = static_cast<float>(0.125);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 * tmp3;
auto tmp5 = (tmp4);
auto tmp7 = static_cast<float>(1.0);
auto tmp8 = at::vec::Vectorized<float>(tmp7);
auto tmp9 = tmp8 - tmp6;
auto tmp10 = static_cast<float>(-10000.0);
auto tmp11 = at::vec::Vectorized<float>(tmp10);
auto tmp12 = tmp9 * tmp11;
auto tmp13 = tmp5 + tmp12;
auto tmp14 = at::vec::convert<bfloat16>(tmp13); + static_cast<long>(x1 + (512L*x0)), 16);
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(576L); x1+=static_cast<long>(1L))
for(long x2=static_cast<long>(0L); x2<static_cast<long>(192L); x2+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr2 + static_cast<long>(x2 + (192L*x1) + (110592L*x0)), 16);
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr3 + static_cast<long>(x2 + (192L*x1)), 16);
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = static_cast<float>(0.125);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 * tmp3;
auto tmp5 = (tmp4);
auto tmp7 = static_cast<float>(1.0);
auto tmp8 = at::vec::Vectorized<float>(tmp7);
auto tmp9 = tmp8 - tmp6;
auto tmp10 = static_cast<float>(-10000.0);
auto tmp11 = at::vec::Vectorized<float>(tmp10);
auto tmp12 = tmp9 * tmp11;
auto tmp13 = tmp5 + tmp12;
auto tmp14 = at::vec::convert<bfloat16>(tmp13); + static_cast<long>(x2 + (512L*x1) + (294912L*x0)), 16);
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(9L); x1+=static_cast<long>(1L))
#pragma GCC ivdep
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(1L))
for(long x3=static_cast<long>(0L); x3<static_cast<long>(192L); x3+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr4 + static_cast<long>(x3 + (192L*x2) + (12288L*x1) + (110592L*x0)), 16);
auto tmp6 = in_ptr5[static_cast<long>(128L + x2 + (64L*x1))];
auto tmp7 =
__at_align__ std::array<int64_t, 16> tmpbuf;
#pragma GCC unroll 16
for (long x3_inner = 0; x3_inner < 16; x3_inner++)
tmpbuf[x3_inner] = in_ptr6[static_cast<long>(c10::div_floor_integer((192L + x3 + x3_inner + (192L*x1) + (2112L*x0)), 64L))];
return at::vec::VectorizedN<int64_t,2>::loadu(, 16);
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = static_cast<float>(0.125);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 * tmp3;
auto tmp5 = (tmp4);
auto tmp8 = 13L;
auto tmp9 = c10::convert<int64_t>(tmp8);
auto tmp10 = at::vec::VectorizedN<int64_t,2>(tmp9);
auto tmp11 = tmp7 + tmp10;
auto tmp12 = static_cast<int64_t>(0);
auto tmp13 = at::vec::VectorizedN<int64_t,2>(tmp12);
auto tmp14 = at::vec::VecMask<int64_t,2>(tmp7 < tmp13);
auto tmp15 = decltype(tmp11)::blendv(tmp7, tmp11, tmp14.template cast<int64_t,2>());
auto tmp16 =
__at_align__ std::array<int64_t, 16> tmpbuf;;
return tmpbuf;
auto tmp17 =
__at_align__ std::array<int64_t, 16> tmpbuf;
#pragma GCC unroll 16
for (long x3_inner = 0; x3_inner < 16; x3_inner++)
tmpbuf[x3_inner] = static_cast<long>(tmp16[x3_inner]);
return at::vec::VectorizedN<int64_t,2>::loadu(, 16);
TORCH_CHECK((at::vec::VecMask<int64_t,2>((at::vec::VectorizedN<int64_t,2>(0) <= tmp17) & (tmp17 < at::vec::VectorizedN<int64_t,2>(13L)))).all_masked(), "index out of bounds: 0 <= tmp17 < 13L");
auto tmp19 =
__at_align__ std::array<float, 16> tmpbuf;
#pragma GCC unroll 16
for (long x3_inner = 0; x3_inner < 16; x3_inner++)
tmpbuf[x3_inner] = in_ptr5[static_cast<long>((64L*tmp16[x3_inner]) + (static_cast<long>((x3 + x3_inner)) % static_cast<long>(64L)))];
return at::vec::Vectorized<float>::loadu(, 16);
auto tmp20 = at::vec::Vectorized<float>(tmp6);
auto tmp21 = tmp20 * tmp19;
auto tmp22 = static_cast<float>(1.0);
auto tmp23 = at::vec::Vectorized<float>(tmp22);
auto tmp24 = tmp23 - tmp21;
auto tmp25 = static_cast<float>(-10000.0);
auto tmp26 = at::vec::Vectorized<float>(tmp25);
auto tmp27 = tmp24 * tmp26;
auto tmp28 = tmp5 + tmp27;
auto tmp29 = at::vec::convert<bfloat16>(tmp28); + static_cast<long>(x3 + (512L*x2) + (32768L*x1) + (294912L*x0)), 16);
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(6912L); x0+=static_cast<long>(1L))
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr7 + static_cast<long>(x1 + (64L*x0)), 16);
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(768L + x1), 16);
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = static_cast<float>(0.125);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 * tmp3;
auto tmp5 = (tmp4);
auto tmp7 = static_cast<float>(1.0);
auto tmp8 = at::vec::Vectorized<float>(tmp7);
auto tmp9 = tmp8 - tmp6;
auto tmp10 = static_cast<float>(-10000.0);
auto tmp11 = at::vec::Vectorized<float>(tmp10);
auto tmp12 = tmp9 * tmp11;
auto tmp13 = tmp5 + tmp12;
auto tmp14 = at::vec::convert<bfloat16>(tmp13); + static_cast<long>(x1 + (512L*x0)), 16);
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(6912L); x0+=static_cast<long>(1L))
float tmp_acc0 = -std::numeric_limits<float>::infinity();
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(-std::numeric_limits<float>::infinity());
for(long x1=static_cast<long>(0L); x1<static_cast<long>(512L); x1+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr8 + static_cast<long>(x1 + (512L*x0)), 16);
auto tmp1 = at::vec::convert<float>(tmp0);
tmp_acc0_vec = at::vec::maximum(tmp_acc0_vec, tmp1);
tmp_acc0 = max_propagate_nan(tmp_acc0, at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return at::vec::maximum(x, y); }, tmp_acc0_vec));
out_ptr4[static_cast<long>(x0)] = static_cast<float>(tmp_acc0);
float tmp_acc0 = 0;
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
for(long x1=static_cast<long>(0L); x1<static_cast<long>(512L); x1+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr8 + static_cast<long>(x1 + (512L*x0)), 16);
auto tmp2 = out_ptr4[static_cast<long>(x0)];
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 - tmp3;
auto tmp5 = tmp4.exp(); + static_cast<long>(x1 + (512L*x0)));
tmp_acc0_vec = tmp_acc0_vec + tmp5;
tmp_acc0 = tmp_acc0 + at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return x + y; }, tmp_acc0_vec);
out_ptr6[static_cast<long>(x0)] = static_cast<float>(tmp_acc0);
for(long x1=static_cast<long>(0L); x1<static_cast<long>(512L); x1+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr5 + static_cast<long>(x1 + (512L*x0)), 16);
auto tmp1 = out_ptr6[static_cast<long>(x0)];
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 / tmp2;
auto tmp4 = at::vec::convert<bfloat16>(tmp3); + static_cast<long>(x1 + (512L*x0)), 16);
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(9L); x1+=static_cast<long>(1L))
#pragma GCC ivdep
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(1L))
for(long x3=static_cast<long>(0L); x3<static_cast<long>(64L); x3+=static_cast<long>(32L))
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr9 + static_cast<long>(49152L + x3 + (64L*x0) + (768L*x2) + (49152L*x1)), 32); + static_cast<long>(x3 + (64L*x2) + (12288L*x1) + (110592L*x0)), 32);
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(9L); x1+=static_cast<long>(1L))
#pragma GCC ivdep
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(1L))
for(long x3=static_cast<long>(0L); x3<static_cast<long>(64L); x3+=static_cast<long>(32L))
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr9 + static_cast<long>(98304L + x3 + (64L*x0) + (768L*x2) + (49152L*x1)), 32); + static_cast<long>(x3 + (64L*x2) + (12288L*x1) + (110592L*x0)), 32);
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(9L); x1+=static_cast<long>(1L))
#pragma GCC ivdep
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(1L))
for(long x3=static_cast<long>(0L); x3<static_cast<long>(64L); x3+=static_cast<long>(32L))
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr9 + static_cast<long>(147456L + x3 + (64L*x0) + (768L*x2) + (49152L*x1)), 32); + static_cast<long>(x3 + (64L*x2) + (12288L*x1) + (110592L*x0)), 32);
cpp_fused_cat_6 = async_compile.cpp_pybinding(['const bfloat16*', 'bfloat16*', 'bfloat16*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const bfloat16* in_ptr0,
bfloat16* out_ptr0,
bfloat16* out_ptr1)
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L))
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L))
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(491520L + x2 + (64L*x0) + (768L*x1)), 32); + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32);
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L))
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L))
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(540672L + x2 + (64L*x0) + (768L*x1)), 32); + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32);
cpp_fused__softmax_add_cat_minimum_mul_rsub_7 = async_compile.cpp_pybinding(['const float*', 'const bfloat16*', 'const float*', 'const float*', 'const bfloat16*', 'float*', 'float*', 'float*', 'float*', 'float*', 'float*', 'float*', 'bfloat16*', 'bfloat16*', 'bfloat16*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const float* in_ptr0,
const bfloat16* in_ptr1,
const float* in_ptr2,
const float* in_ptr3,
const bfloat16* in_ptr4,
float* out_ptr0,
float* out_ptr1,
float* out_ptr2,
float* out_ptr3,
float* out_ptr4,
float* out_ptr5,
float* out_ptr6,
bfloat16* out_ptr7,
bfloat16* out_ptr8,
bfloat16* out_ptr9)
for(long x0=static_cast<long>(0L); x0<static_cast<long>(64L); x0+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<long>(x0), 16); + static_cast<long>(x0));
for(long x0=static_cast<long>(0L); x0<static_cast<long>(192L); x0+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<long>(640L + x0), 16); + static_cast<long>(x0));
for(long x0=static_cast<long>(0L); x0<static_cast<long>(192L); x0+=static_cast<long>(16L))
auto tmp0 = static_cast<float>(1.0);
auto tmp1 = at::vec::Vectorized<float>(tmp0); + static_cast<long>(x0));
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(768L); x0+=static_cast<long>(1L))
for(long x1=static_cast<long>(0L); x1<static_cast<long>(256L); x1+=static_cast<long>(16L))
auto tmp0 = static_cast<float>(1.0);
auto tmp1 = at::vec::Vectorized<float>(tmp0); + static_cast<long>(x1 + (448L*x0)));
#pragma omp parallel num_threads(56)
int tid = omp_get_thread_num();
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(768L); x0+=static_cast<long>(1L))
float tmp_acc0 = -std::numeric_limits<float>::infinity();
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(-std::numeric_limits<float>::infinity());
for(long x1=static_cast<long>(0L); x1<static_cast<long>(448L); x1+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr1 + static_cast<long>(x1 + (448L*x0)), 16);
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr2 + static_cast<long>(x1), 16);
auto tmp7 = at::vec::Vectorized<float>::loadu(in_ptr3 + static_cast<long>(x1 + (448L*x0)), 16);
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = static_cast<float>(0.125);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 * tmp3;
auto tmp5 = (tmp4);
auto tmp8 = at::vec::minimum(tmp6, tmp7);
auto tmp9 = static_cast<float>(1.0);
auto tmp10 = at::vec::Vectorized<float>(tmp9);
auto tmp11 = tmp10 - tmp8;
auto tmp12 = static_cast<float>(-10000.0);
auto tmp13 = at::vec::Vectorized<float>(tmp12);
auto tmp14 = tmp11 * tmp13;
auto tmp15 = tmp5 + tmp14;
tmp_acc0_vec = at::vec::maximum(tmp_acc0_vec, tmp15);
tmp_acc0 = max_propagate_nan(tmp_acc0, at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return at::vec::maximum(x, y); }, tmp_acc0_vec));
out_ptr4[static_cast<long>(x0)] = static_cast<float>(tmp_acc0);
float tmp_acc0 = 0;
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
for(long x1=static_cast<long>(0L); x1<static_cast<long>(448L); x1+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr1 + static_cast<long>(x1 + (448L*x0)), 16);
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr2 + static_cast<long>(x1), 16);
auto tmp7 = at::vec::Vectorized<float>::loadu(in_ptr3 + static_cast<long>(x1 + (448L*x0)), 16);
auto tmp16 = out_ptr4[static_cast<long>(x0)];
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = static_cast<float>(0.125);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 * tmp3;
auto tmp5 = (tmp4);
auto tmp8 = at::vec::minimum(tmp6, tmp7);
auto tmp9 = static_cast<float>(1.0);
auto tmp10 = at::vec::Vectorized<float>(tmp9);
auto tmp11 = tmp10 - tmp8;
auto tmp12 = static_cast<float>(-10000.0);
auto tmp13 = at::vec::Vectorized<float>(tmp12);
auto tmp14 = tmp11 * tmp13;
auto tmp15 = tmp5 + tmp14;
auto tmp17 = at::vec::Vectorized<float>(tmp16);
auto tmp18 = tmp15 - tmp17;
auto tmp19 = tmp18.exp(); + static_cast<long>(x1 + (448L*x0)));
tmp_acc0_vec = tmp_acc0_vec + tmp19;
tmp_acc0 = tmp_acc0 + at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return x + y; }, tmp_acc0_vec);
out_ptr6[static_cast<long>(x0)] = static_cast<float>(tmp_acc0);
for(long x1=static_cast<long>(0L); x1<static_cast<long>(448L); x1+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr5 + static_cast<long>(x1 + (448L*x0)), 16);
auto tmp1 = out_ptr6[static_cast<long>(x0)];
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 / tmp2;
auto tmp4 = at::vec::convert<bfloat16>(tmp3); + static_cast<long>(x1 + (448L*x0)), 16);
#pragma omp single
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L))
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L))
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr4 + static_cast<long>(491520L + x2 + (64L*x0) + (768L*x1)), 32); + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32);
#pragma omp single
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L))
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L))
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr4 + static_cast<long>(540672L + x2 + (64L*x0) + (768L*x1)), 32); + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32);
cpp_fused__softmax_add_mul_rsub_8 = async_compile.cpp_pybinding(['const bfloat16*', 'const float*', 'float*', 'float*', 'float*', 'bfloat16*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const bfloat16* in_ptr0,
const float* in_ptr1,
float* out_ptr0,
float* out_ptr1,
float* out_ptr2,
bfloat16* out_ptr3)
#pragma omp parallel num_threads(56)
int tid = omp_get_thread_num();
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(768L); x0+=static_cast<long>(1L))
float tmp_acc0 = -std::numeric_limits<float>::infinity();
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(-std::numeric_limits<float>::infinity());
for(long x1=static_cast<long>(0L); x1<static_cast<long>(832L); x1+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x1 + (832L*x0)), 16);
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(x1), 16);
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = static_cast<float>(0.125);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 * tmp3;
auto tmp5 = (tmp4);
auto tmp7 = static_cast<float>(1.0);
auto tmp8 = at::vec::Vectorized<float>(tmp7);
auto tmp9 = tmp8 - tmp6;
auto tmp10 = static_cast<float>(-10000.0);
auto tmp11 = at::vec::Vectorized<float>(tmp10);
auto tmp12 = tmp9 * tmp11;
auto tmp13 = tmp5 + tmp12;
tmp_acc0_vec = at::vec::maximum(tmp_acc0_vec, tmp13);
tmp_acc0 = max_propagate_nan(tmp_acc0, at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return at::vec::maximum(x, y); }, tmp_acc0_vec));
out_ptr0[static_cast<long>(x0)] = static_cast<float>(tmp_acc0);
float tmp_acc0 = 0;
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
for(long x1=static_cast<long>(0L); x1<static_cast<long>(832L); x1+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x1 + (832L*x0)), 16);
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(x1), 16);
auto tmp14 = out_ptr0[static_cast<long>(x0)];
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = static_cast<float>(0.125);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 * tmp3;
auto tmp5 = (tmp4);
auto tmp7 = static_cast<float>(1.0);
auto tmp8 = at::vec::Vectorized<float>(tmp7);
auto tmp9 = tmp8 - tmp6;
auto tmp10 = static_cast<float>(-10000.0);
auto tmp11 = at::vec::Vectorized<float>(tmp10);
auto tmp12 = tmp9 * tmp11;
auto tmp13 = tmp5 + tmp12;
auto tmp15 = at::vec::Vectorized<float>(tmp14);
auto tmp16 = tmp13 - tmp15;
auto tmp17 = tmp16.exp(); + static_cast<long>(x1 + (832L*x0)));
tmp_acc0_vec = tmp_acc0_vec + tmp17;
tmp_acc0 = tmp_acc0 + at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return x + y; }, tmp_acc0_vec);
out_ptr2[static_cast<long>(x0)] = static_cast<float>(tmp_acc0);
for(long x1=static_cast<long>(0L); x1<static_cast<long>(832L); x1+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr1 + static_cast<long>(x1 + (832L*x0)), 16);
auto tmp1 = out_ptr2[static_cast<long>(x0)];
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 / tmp2;
auto tmp4 = at::vec::convert<bfloat16>(tmp3); + static_cast<long>(x1 + (832L*x0)), 16);
cpp_fused_cat_mul_9 = async_compile.cpp_pybinding(['const bfloat16*', 'const bfloat16*', 'const bfloat16*', 'const bfloat16*', 'const bfloat16*', 'const bfloat16*', 'const bfloat16*', 'const bfloat16*', 'const bfloat16*', 'const float*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'float*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const bfloat16* in_ptr0,
const bfloat16* in_ptr1,
const bfloat16* in_ptr2,
const bfloat16* in_ptr3,
const bfloat16* in_ptr4,
const bfloat16* in_ptr5,
const bfloat16* in_ptr6,
const bfloat16* in_ptr7,
const bfloat16* in_ptr8,
const float* in_ptr9,
bfloat16* out_ptr0,
bfloat16* out_ptr1,
bfloat16* out_ptr2,
bfloat16* out_ptr3,
bfloat16* out_ptr4,
float* out_ptr5)
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
for(long x1=static_cast<long>(0L); x1<static_cast<long>(4096L); x1+=static_cast<long>(32L))
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x1 + (4096L*x0)), 32); + static_cast<long>(x1 + (53248L*x0)), 32);
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
for(long x1=static_cast<long>(0L); x1<static_cast<long>(4096L); x1+=static_cast<long>(32L))
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr1 + static_cast<long>(x1 + (4096L*x0)), 32); + static_cast<long>(x1 + (53248L*x0)), 32);
#pragma omp parallel num_threads(56)
int tid = omp_get_thread_num();
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
for(long x1=static_cast<long>(0L); x1<static_cast<long>(36864L); x1+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr2 + static_cast<long>(x1 + (36864L*x0)), 16);
auto tmp2 = at::vec::Vectorized<bfloat16>::loadu(in_ptr3 + static_cast<long>(x1 + (36864L*x0)), 16);
auto tmp5 = at::vec::Vectorized<bfloat16>::loadu(in_ptr4 + static_cast<long>(x1 + (36864L*x0)), 16);
auto tmp8 = at::vec::Vectorized<bfloat16>::loadu(in_ptr5 + static_cast<long>(x1 + (36864L*x0)), 16);
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp3 = at::vec::convert<float>(tmp2);
auto tmp4 = tmp1 + tmp3;
auto tmp6 = at::vec::convert<float>(tmp5);
auto tmp7 = tmp4 + tmp6;
auto tmp9 = at::vec::convert<float>(tmp8);
auto tmp10 = tmp7 + tmp9;
auto tmp11 = at::vec::convert<bfloat16>(tmp10); + static_cast<long>(x1 + (53248L*x0)), 16);
#pragma omp single
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
for(long x1=static_cast<long>(0L); x1<static_cast<long>(4096L); x1+=static_cast<long>(32L))
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr6 + static_cast<long>(x1 + (4096L*x0)), 32); + static_cast<long>(x1 + (53248L*x0)), 32);
#pragma omp single
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
for(long x1=static_cast<long>(0L); x1<static_cast<long>(4096L); x1+=static_cast<long>(32L))
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr7 + static_cast<long>(x1 + (4096L*x0)), 32); + static_cast<long>(x1 + (53248L*x0)), 32);
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(832L); x1+=static_cast<long>(1L))
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr8 + static_cast<long>(x2 + (64L*x1) + (53248L*x0)), 16);
auto tmp2 = in_ptr9[static_cast<long>(x1)];
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 * tmp3; + static_cast<long>(x2 + (64L*x1) + (53248L*x0)));
del async_compile
def call(args):
arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, arg12_1, arg13_1, arg14_1, arg15_1, arg16_1, arg17_1, arg18_1 = args
assert_size_stride(arg0_1, (11, 3), (3, 1))
assert_size_stride(arg1_1, (11, 3), (3, 1))
assert_size_stride(arg2_1, (11, 3), (3, 1))
assert_size_stride(arg3_1, (11, 3), (3, 1))
assert_size_stride(arg4_1, (11, 3), (3, 1))
assert_size_stride(arg5_1, (11, 3), (3, 1))
assert_size_stride(arg6_1, (11, 3), (3, 1))
assert_size_stride(arg7_1, (11, 3), (3, 1))
assert_size_stride(arg8_1, (11, 3), (3, 1))
assert_size_stride(arg9_1, (11, 3), (3, 1))
assert_size_stride(arg10_1, (11, 3), (3, 1))
assert_size_stride(arg11_1, (11, 3), (3, 1))
assert_size_stride(arg12_1, (1, 12, 832, 64), (638976, 64, 768, 1))
assert_size_stride(arg13_1, (1, 13, 64), (832, 64, 1))
assert_size_stride(arg14_1, (1, 12, 832, 64), (638976, 64, 768, 1))
assert_size_stride(arg15_1, (1, 12, 832, 64), (638976, 64, 768, 1))
assert_size_stride(arg16_1, (1, 1, 1, 832), (832, 832, 832, 1))
assert_size_stride(arg17_1, (1, 1, 9, 64, 192), (110592, 110592, 12288, 192, 1))
assert_size_stride(arg18_1, (1, 1, 832, 1), (832, 832, 1, 1))
buf0 = empty_strided_cpu((12, 64, 832), (53248, 832, 1), torch.bfloat16)
# Source Nodes: [bmm], Original ATen: [aten.bmm]
extern_kernels.bmm(reinterpret_tensor(arg12_1, (12, 64, 64), (64, 768, 1), 0), reinterpret_tensor(arg14_1, (12, 64, 832), (64, 1, 768), 0), out=buf0)
buf1 = empty_strided_cpu((1, 12, 64, 1), (768, 64, 1, 768), torch.float32)
buf2 = empty_strided_cpu((1, 12, 64, 832), (638976, 53248, 832, 1), torch.float32)
buf3 = empty_strided_cpu((1, 12, 64, 1), (768, 64, 1, 768), torch.float32)
buf4 = empty_strided_cpu((1, 12, 64, 832), (638976, 53248, 832, 1), torch.bfloat16)
cpp_fused__softmax_add_mul_rsub_0(buf0, arg16_1, buf1, buf2, buf3, buf4)
buf5 = empty_strided_cpu((12, 64, 64), (4096, 64, 1), torch.bfloat16)
# Source Nodes: [bmm_1], Original ATen: [aten.bmm]
extern_kernels.bmm(reinterpret_tensor(buf4, (12, 64, 832), (53248, 832, 1), 0), reinterpret_tensor(arg15_1, (12, 832, 64), (64, 768, 1), 0), out=buf5)
buf18 = empty_strided_cpu((132, 3), (3, 1), torch.int32)
buf6 = reinterpret_tensor(buf18, (11, 3), (3, 1), 0) # alias
buf7 = reinterpret_tensor(buf18, (11, 3), (3, 1), 33) # alias
buf8 = reinterpret_tensor(buf18, (11, 3), (3, 1), 66) # alias
buf9 = reinterpret_tensor(buf18, (11, 3), (3, 1), 99) # alias
buf10 = reinterpret_tensor(buf18, (11, 3), (3, 1), 132) # alias
buf11 = reinterpret_tensor(buf18, (11, 3), (3, 1), 165) # alias
buf12 = reinterpret_tensor(buf18, (11, 3), (3, 1), 198) # alias
buf13 = reinterpret_tensor(buf18, (11, 3), (3, 1), 231) # alias
buf14 = reinterpret_tensor(buf18, (11, 3), (3, 1), 264) # alias
buf15 = reinterpret_tensor(buf18, (11, 3), (3, 1), 297) # alias
buf16 = reinterpret_tensor(buf18, (11, 3), (3, 1), 330) # alias
buf17 = reinterpret_tensor(buf18, (11, 3), (3, 1), 363) # alias
buf19 = empty_strided_cpu((12, 11, 3), (33, 3, 1), torch.int64)
buf25 = empty_strided_cpu((1, 12, 448, 64), (344064, 28672, 64, 1), torch.bfloat16)
buf20 = reinterpret_tensor(buf25, (1, 12, 64, 64), (344064, 28672, 64, 1), 0) # alias
buf78 = empty_strided_cpu((1, 12, 448, 64), (344064, 28672, 64, 1), torch.bfloat16)
buf73 = reinterpret_tensor(buf78, (1, 12, 64, 64), (344064, 28672, 64, 1), 0) # alias
buf21 = reinterpret_tensor(buf25, (1, 12, 64, 64), (344064, 28672, 64, 1), 4096) # alias
buf22 = reinterpret_tensor(buf25, (1, 12, 64, 64), (344064, 28672, 64, 1), 8192) # alias
buf23 = reinterpret_tensor(buf25, (1, 12, 64, 64), (344064, 28672, 64, 1), 12288) # alias
buf76 = reinterpret_tensor(buf78, (1, 12, 64, 64), (344064, 28672, 64, 1), 12288) # alias
buf24 = reinterpret_tensor(buf25, (1, 12, 192, 64), (344064, 28672, 64, 1), 16384) # alias
buf77 = reinterpret_tensor(buf78, (1, 12, 192, 64), (344064, 28672, 64, 1), 16384) # alias
cpp_fused__to_copy_cat_stack_1(arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, buf18, arg14_1, buf6, buf7, buf8, buf9, buf10, buf11, buf12, buf13, buf14, buf15, buf16, buf17, buf19, buf20, buf73, buf21, buf22, buf23, buf76, buf24, buf77)
del arg0_1
del arg10_1
del arg11_1
del arg1_1
del arg2_1
del arg3_1
del arg4_1
del arg5_1
del arg6_1
del arg7_1
del arg8_1
del arg9_1
del buf10
del buf11
del buf12
del buf13
del buf14
del buf15
del buf16
del buf17
del buf18
del buf20
del buf21
del buf22
del buf23
del buf24
del buf6
del buf7
del buf8
del buf9
buf26 = empty_strided_cpu((12, 64, 448), (28672, 448, 1), torch.bfloat16)
# Source Nodes: [bmm_2], Original ATen: [aten.bmm]
extern_kernels.bmm(reinterpret_tensor(arg12_1, (12, 64, 64), (64, 768, 1), 49152), reinterpret_tensor(buf25, (12, 64, 448), (28672, 1, 64), 0), out=buf26)
buf30 = empty_strided_cpu((1, 1, 1, 448), (448, 448, 448, 1), torch.float32)
buf27 = reinterpret_tensor(buf30, (1, 1, 1, 192), (448, 448, 448, 1), 0) # alias
buf28 = reinterpret_tensor(buf30, (1, 1, 1, 64), (448, 448, 448, 1), 192) # alias
buf29 = reinterpret_tensor(buf30, (1, 1, 1, 192), (448, 448, 448, 1), 256) # alias
buf33 = empty_strided_cpu((1, 12, 64, 448), (344064, 28672, 448, 1), torch.float32)
buf31 = reinterpret_tensor(buf33, (1, 12, 64, 256), (344064, 28672, 448, 1), 0) # alias
buf32 = reinterpret_tensor(buf33, (1, 12, 64, 192), (344064, 28672, 448, 1), 256) # alias
buf86 = empty_strided_cpu((1, 12, 64, 448), (344064, 28672, 448, 1), torch.float32)
buf85 = reinterpret_tensor(buf86, (1, 12, 64, 192), (344064, 28672, 448, 1), 256) # alias
buf34 = buf3; del buf3 # reuse
buf35 = empty_strided_cpu((1, 12, 64, 448), (344064, 28672, 448, 1), torch.float32)
buf36 = buf1; del buf1 # reuse
buf43 = reinterpret_tensor(buf25, (1, 12, 64, 448), (344064, 28672, 448, 1), 0); del buf25 # reuse
buf42 = empty_strided_cpu((1, 12, 448, 64), (344064, 28672, 64, 1), torch.bfloat16)
buf37 = reinterpret_tensor(buf42, (1, 12, 64, 64), (344064, 28672, 64, 1), 0) # alias
buf95 = empty_strided_cpu((1, 12, 448, 64), (344064, 28672, 64, 1), torch.bfloat16)
buf90 = reinterpret_tensor(buf95, (1, 12, 64, 64), (344064, 28672, 64, 1), 0) # alias
buf38 = reinterpret_tensor(buf42, (1, 12, 64, 64), (344064, 28672, 64, 1), 4096) # alias
buf39 = reinterpret_tensor(buf42, (1, 12, 64, 64), (344064, 28672, 64, 1), 8192) # alias
buf40 = reinterpret_tensor(buf42, (1, 12, 64, 64), (344064, 28672, 64, 1), 12288) # alias
buf93 = reinterpret_tensor(buf95, (1, 12, 64, 64), (344064, 28672, 64, 1), 12288) # alias
buf41 = reinterpret_tensor(buf42, (1, 12, 192, 64), (344064, 28672, 64, 1), 16384) # alias
buf94 = reinterpret_tensor(buf95, (1, 12, 192, 64), (344064, 28672, 64, 1), 16384) # alias
cpp_fused__softmax_add_cat_minimum_mul_new_ones_rsub_2(arg16_1, arg13_1, buf19, buf26, buf30, buf33, arg15_1, buf27, buf28, buf29, buf31, buf32, buf85, buf34, buf35, buf36, buf43, buf37, buf90, buf38, buf39, buf40, buf93, buf41, buf94)
del buf26
del buf27
del buf28
del buf29
del buf31
del buf32
del buf33
del buf37
del buf38
del buf39
del buf40
del buf41
buf44 = empty_strided_cpu((12, 64, 64), (4096, 64, 1), torch.bfloat16)
# Source Nodes: [bmm_3], Original ATen: [aten.bmm]
extern_kernels.bmm(reinterpret_tensor(buf43, (12, 64, 448), (28672, 448, 1), 0), reinterpret_tensor(buf42, (12, 448, 64), (28672, 64, 1), 0), out=buf44)
del buf42
buf45 = empty_strided_cpu((12, 576, 64), (36864, 64, 1), torch.bfloat16)
# Source Nodes: [first_band_product], Original ATen: [aten.bmm]
extern_kernels.bmm(reinterpret_tensor(arg12_1, (12, 576, 64), (64, 768, 1), 98304), reinterpret_tensor(arg14_1, (12, 64, 64), (64, 1, 768), 0), out=buf45)
buf49 = empty_strided_cpu((1, 12, 9, 192, 64), (1327104, 110592, 12288, 64, 1), torch.bfloat16)
buf46 = reinterpret_tensor(buf49, (1, 12, 9, 64, 64), (1327104, 110592, 12288, 64, 1), 0) # alias
buf47 = reinterpret_tensor(buf49, (1, 12, 9, 64, 64), (1327104, 110592, 12288, 64, 1), 4096) # alias
buf48 = reinterpret_tensor(buf49, (1, 12, 9, 64, 64), (1327104, 110592, 12288, 64, 1), 8192) # alias
buf50 = empty_strided_cpu((1, 12, 9, 64, 64), (442368, 36864, 4096, 64, 1), torch.bfloat16)
cpp_fused_cat_clone_3(arg14_1, arg12_1, buf46, buf47, buf48, buf50)
del buf46
del buf47
del buf48
buf51 = empty_strided_cpu((108, 64, 192), (12288, 192, 1), torch.bfloat16)
# Source Nodes: [bmm_4], Original ATen: [aten.bmm]
extern_kernels.bmm(reinterpret_tensor(buf50, (108, 64, 64), (4096, 64, 1), 0), reinterpret_tensor(buf49, (108, 64, 192), (12288, 1, 64), 0), out=buf51)
buf52 = buf49; del buf49 # reuse
buf69 = empty_strided_cpu((1, 12, 9, 192, 64), (1327104, 110592, 12288, 64, 1), torch.bfloat16)
cpp_fused_clone_4(buf19, arg14_1, arg15_1, buf52, buf69)
buf53 = empty_strided_cpu((108, 64, 192), (12288, 192, 1), torch.bfloat16)
# Source Nodes: [bmm_5], Original ATen: [aten.bmm]
extern_kernels.bmm(reinterpret_tensor(buf50, (108, 64, 64), (4096, 64, 1), 0), reinterpret_tensor(buf52, (108, 64, 192), (12288, 1, 64), 0), out=buf53)
buf54 = reinterpret_tensor(buf50, (12, 576, 64), (36864, 64, 1), 0); del buf50 # reuse
# Source Nodes: [last_band_product], Original ATen: [aten.bmm]
extern_kernels.bmm(reinterpret_tensor(arg12_1, (12, 576, 64), (64, 768, 1), 98304), reinterpret_tensor(arg14_1, (12, 64, 64), (64, 1, 768), 589824), out=buf54)
buf59 = empty_strided_cpu((1, 12, 9, 64, 512), (3538944, 294912, 32768, 512, 1), torch.bfloat16)
buf55 = reinterpret_tensor(buf59, (1, 12, 9, 64, 64), (3538944, 294912, 32768, 512, 1), 0) # alias
buf56 = reinterpret_tensor(buf59, (1, 12, 9, 64, 192), (3538944, 294912, 32768, 512, 1), 64) # alias
buf57 = reinterpret_tensor(buf59, (1, 12, 9, 64, 192), (3538944, 294912, 32768, 512, 1), 256) # alias
buf58 = reinterpret_tensor(buf59, (1, 12, 9, 64, 64), (3538944, 294912, 32768, 512, 1), 448) # alias
buf60 = empty_strided_cpu((1, 12, 9, 64, 1), (6912, 576, 64, 1, 6912), torch.float32)
buf61 = empty_strided_cpu((1, 12, 9, 64, 512), (3538944, 294912, 32768, 512, 1), torch.float32)
buf62 = empty_strided_cpu((1, 12, 9, 64, 1), (6912, 576, 64, 1, 6912), torch.float32)
buf67 = empty_strided_cpu((1, 12, 9, 64, 512), (3538944, 294912, 32768, 512, 1), torch.bfloat16)
buf66 = buf52; del buf52 # reuse
buf63 = reinterpret_tensor(buf66, (1, 12, 9, 64, 64), (1327104, 110592, 12288, 64, 1), 0) # alias
buf64 = reinterpret_tensor(buf66, (1, 12, 9, 64, 64), (1327104, 110592, 12288, 64, 1), 4096) # alias
buf65 = reinterpret_tensor(buf66, (1, 12, 9, 64, 64), (1327104, 110592, 12288, 64, 1), 8192) # alias
cpp_fused__softmax__to_copy_add_cat_mul_rsub_5(buf45, arg16_1, buf51, arg17_1, buf53, arg13_1, buf19, buf54, buf59, arg15_1, buf55, buf56, buf57, buf58, buf60, buf61, buf62, buf67, buf63, buf64, buf65)
del arg13_1
del arg17_1
del buf51
del buf53
del buf55
del buf56
del buf57
del buf58
del buf59
del buf60
del buf61
del buf62
del buf63
del buf64
del buf65
buf68 = reinterpret_tensor(buf54, (108, 64, 64), (4096, 64, 1), 0); del buf54 # reuse
# Source Nodes: [bmm_6], Original ATen: [aten.bmm]
extern_kernels.bmm(reinterpret_tensor(buf67, (108, 64, 192), (32768, 512, 1), 64), reinterpret_tensor(buf66, (108, 192, 64), (12288, 64, 1), 0), out=buf68)
del buf66
buf70 = reinterpret_tensor(buf45, (108, 64, 64), (4096, 64, 1), 0); del buf45 # reuse
# Source Nodes: [bmm_7], Original ATen: [aten.bmm]
extern_kernels.bmm(reinterpret_tensor(buf67, (108, 64, 192), (32768, 512, 1), 256), reinterpret_tensor(buf69, (108, 192, 64), (12288, 64, 1), 0), out=buf70)
del buf69
buf71 = empty_strided_cpu((12, 576, 64), (36864, 64, 1), torch.bfloat16)
# Source Nodes: [einsum_3], Original ATen: [aten.bmm]
extern_kernels.bmm(reinterpret_tensor(buf67, (12, 576, 64), (294912, 512, 1), 0), reinterpret_tensor(arg15_1, (12, 64, 64), (64, 768, 1), 0), out=buf71)
buf72 = empty_strided_cpu((12, 576, 64), (36864, 64, 1), torch.bfloat16)
# Source Nodes: [einsum_4], Original ATen: [aten.bmm]
extern_kernels.bmm(reinterpret_tensor(buf67, (12, 576, 64), (294912, 512, 1), 448), reinterpret_tensor(arg15_1, (12, 64, 64), (64, 768, 1), 589824), out=buf72)
del buf67
buf74 = reinterpret_tensor(buf78, (1, 12, 64, 64), (344064, 28672, 64, 1), 4096) # alias
buf75 = reinterpret_tensor(buf78, (1, 12, 64, 64), (344064, 28672, 64, 1), 8192) # alias
cpp_fused_cat_6(arg14_1, buf74, buf75)
del buf73
del buf74
del buf75
del buf76
del buf77
buf79 = reinterpret_tensor(buf43, (12, 64, 448), (28672, 448, 1), 0); del buf43 # reuse
# Source Nodes: [bmm_8], Original ATen: [aten.bmm]
extern_kernels.bmm(reinterpret_tensor(arg12_1, (12, 64, 64), (64, 768, 1), 540672), reinterpret_tensor(buf78, (12, 64, 448), (28672, 1, 64), 0), out=buf79)
buf83 = buf30; del buf30 # reuse
buf80 = reinterpret_tensor(buf83, (1, 1, 1, 64), (448, 448, 448, 1), 0) # alias
buf81 = reinterpret_tensor(buf83, (1, 1, 1, 192), (448, 448, 448, 1), 64) # alias
buf82 = reinterpret_tensor(buf83, (1, 1, 1, 192), (448, 448, 448, 1), 256) # alias
buf84 = reinterpret_tensor(buf86, (1, 12, 64, 256), (344064, 28672, 448, 1), 0) # alias
buf87 = buf36; del buf36 # reuse
buf88 = buf35; del buf35 # reuse
buf89 = buf34; del buf34 # reuse
buf96 = reinterpret_tensor(buf78, (1, 12, 64, 448), (344064, 28672, 448, 1), 0); del buf78 # reuse
buf91 = reinterpret_tensor(buf95, (1, 12, 64, 64), (344064, 28672, 64, 1), 4096) # alias
buf92 = reinterpret_tensor(buf95, (1, 12, 64, 64), (344064, 28672, 64, 1), 8192) # alias
cpp_fused__softmax_add_cat_minimum_mul_rsub_7(arg16_1, buf79, buf83, buf86, arg15_1, buf80, buf81, buf82, buf84, buf87, buf88, buf89, buf96, buf91, buf92)
del buf79
del buf80
del buf81
del buf82
del buf83
del buf84
del buf85
del buf86
del buf88
del buf90
del buf91
del buf92
del buf93
del buf94
buf97 = empty_strided_cpu((12, 64, 64), (4096, 64, 1), torch.bfloat16)
# Source Nodes: [bmm_9], Original ATen: [aten.bmm]
extern_kernels.bmm(reinterpret_tensor(buf96, (12, 64, 448), (28672, 448, 1), 0), reinterpret_tensor(buf95, (12, 448, 64), (28672, 64, 1), 0), out=buf97)
del buf95
del buf96
buf98 = reinterpret_tensor(buf4, (12, 64, 832), (53248, 832, 1), 0); del buf4 # reuse
# Source Nodes: [bmm_10], Original ATen: [aten.bmm]
extern_kernels.bmm(reinterpret_tensor(arg12_1, (12, 64, 64), (64, 768, 1), 589824), reinterpret_tensor(arg14_1, (12, 64, 832), (64, 1, 768), 0), out=buf98)
del arg12_1
del arg14_1
buf99 = buf89; del buf89 # reuse
buf100 = buf2; del buf2 # reuse
buf101 = buf87; del buf87 # reuse
buf102 = reinterpret_tensor(buf0, (1, 12, 64, 832), (638976, 53248, 832, 1), 0); del buf0 # reuse
cpp_fused__softmax_add_mul_rsub_8(buf98, arg16_1, buf99, buf100, buf101, buf102)
del arg16_1
del buf101
del buf98
del buf99
buf103 = empty_strided_cpu((12, 64, 64), (4096, 64, 1), torch.bfloat16)
# Source Nodes: [bmm_11], Original ATen: [aten.bmm]
extern_kernels.bmm(reinterpret_tensor(buf102, (12, 64, 832), (53248, 832, 1), 0), reinterpret_tensor(arg15_1, (12, 832, 64), (64, 768, 1), 0), out=buf103)
del arg15_1
buf109 = reinterpret_tensor(buf102, (1, 12, 13, 64, 64), (638976, 53248, 4096, 64, 1), 0); del buf102 # reuse
buf104 = reinterpret_tensor(buf109, (1, 12, 1, 64, 64), (638976, 53248, 4096, 64, 1), 0) # alias
buf105 = reinterpret_tensor(buf109, (1, 12, 1, 64, 64), (638976, 53248, 4096, 64, 1), 4096) # alias
buf106 = reinterpret_tensor(buf109, (1, 12, 9, 64, 64), (638976, 53248, 4096, 64, 1), 8192) # alias
buf107 = reinterpret_tensor(buf109, (1, 12, 1, 64, 64), (638976, 53248, 4096, 64, 1), 45056) # alias
buf108 = reinterpret_tensor(buf109, (1, 12, 1, 64, 64), (638976, 53248, 4096, 64, 1), 49152) # alias
buf110 = reinterpret_tensor(buf100, (1, 12, 832, 64), (638976, 53248, 64, 1), 0); del buf100 # reuse
cpp_fused_cat_mul_9(buf5, buf44, buf68, buf70, buf71, buf72, buf97, buf103, buf109, arg18_1, buf104, buf105, buf106, buf107, buf108, buf110)
del arg18_1
return (reinterpret_tensor(buf110, (1, 832, 12, 64), (638976, 64, 53248, 1), 0), reinterpret_tensor(buf19, (1, 12, 11, 3), (396, 33, 3, 1), 0), )
def benchmark_compiled_module(times=10, repeat=10):
    """Time the compiled ``call`` wrapper on randomly generated inputs.

    Nineteen tensors are created on CPU with ``rand_strided`` using the
    fixed sizes, strides and dtypes listed below, and ``call`` is then
    timed through ``print_performance``.

    Args:
        times: forwarded unchanged to ``print_performance``.
        repeat: forwarded unchanged to ``print_performance``.

    Returns:
        Whatever ``print_performance`` returns.
    """
    from torch._dynamo.testing import rand_strided
    from torch._inductor.utils import print_performance

    # (size, stride, dtype) for each positional input, in arg0_1..arg18_1 order.
    int32_2d_spec = ((11, 3), (3, 1), torch.int32)
    bf16_4d_spec = ((1, 12, 832, 64), (638976, 64, 768, 1), torch.bfloat16)
    input_specs = [int32_2d_spec] * 12 + [  # arg0_1 .. arg11_1
        bf16_4d_spec,  # arg12_1
        ((1, 13, 64), (832, 64, 1), torch.float32),  # arg13_1
        bf16_4d_spec,  # arg14_1
        bf16_4d_spec,  # arg15_1
        ((1, 1, 1, 832), (832, 832, 832, 1), torch.float32),  # arg16_1
        (
            (1, 1, 9, 64, 192),
            (110592, 110592, 12288, 192, 1),
            torch.float32,
        ),  # arg17_1
        ((1, 1, 832, 1), (832, 832, 1, 1), torch.float32),  # arg18_1
    ]

    # Tensors are materialised once, in argument order, and reused by every run.
    inputs = [
        rand_strided(size, stride, device='cpu', dtype=dtype)
        for size, stride, dtype in input_specs
    ]

    def run_once():
        # A new list is built for each invocation, so every run sees all
        # nineteen inputs regardless of what ``call`` does with its argument
        # list (NOTE(review): presumably ``call`` may consume it - confirm).
        return call(list(inputs))

    return print_performance(run_once, times=times, repeat=repeat)
if __name__ == "__main__":
    # Script entry point: hand the benchmark function to Inductor's
    # compiled-module driver under the name 'hf_BigBird'.
    from torch._inductor import wrapper_benchmark

    wrapper_benchmark.compiled_module_main('hf_BigBird', benchmark_compiled_module)
V0627 17:31:09.567000 139845268738432 torch/_dynamo/] {"dynamo_guards": {}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
V0627 17:31:09.568000 139845268738432 torch/_dynamo/] {"dynamo_cpp_guards_str": {}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0, "has_payload": "81a28a443bd0d99705f0b5d2b9a46edc"}
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/ in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['bsz'], accessed_by=DictGetItemGuardAccessor(bsz)
| | +- EQUALS_MATCH: L['bsz'] == 1
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202274384)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| +- GuardManager: source=L['n_heads'], accessed_by=DictGetItemGuardAccessor(n_heads)
| | +- EQUALS_MATCH: L['n_heads'] == 12
| +- GuardManager: source=L['rsqrt_d'], accessed_by=DictGetItemGuardAccessor(rsqrt_d)
| | +- EQUALS_MATCH: L['rsqrt_d'] == 0.125
| +- GuardManager: source=L['to_mask'], accessed_by=DictGetItemGuardAccessor(to_mask)
| | +- TENSOR_MATCH: check_tensor(L['to_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 1, 832], stride=[832, 832, 832, 1])
| | +- NO_HASATTR: hasattr(L['to_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| +- GuardManager: source=L['___stack0'], accessed_by=DictGetItemGuardAccessor(___stack0)
| | +- TYPE_MATCH: ___check_type_id(L['___stack0'], 7650400)
| | +- LENGTH_CHECK: len(L['___stack0']) == 12
| | +- GuardManager: source=L['___stack0'][0], accessed_by=ListGetItemGuardAccessor(0)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][0]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][0]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][0]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][1], accessed_by=ListGetItemGuardAccessor(1)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][1]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][1]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][1]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][2], accessed_by=ListGetItemGuardAccessor(2)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][2]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][2]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][2]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][3], accessed_by=ListGetItemGuardAccessor(3)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][3]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][3]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][3]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][4], accessed_by=ListGetItemGuardAccessor(4)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][4]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][4]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][4]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][5], accessed_by=ListGetItemGuardAccessor(5)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][5]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][5]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][5]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][6], accessed_by=ListGetItemGuardAccessor(6)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][6]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][6]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][6]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][7], accessed_by=ListGetItemGuardAccessor(7)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][7]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][7]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][7]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][8], accessed_by=ListGetItemGuardAccessor(8)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][8]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][8]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][8]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][9], accessed_by=ListGetItemGuardAccessor(9)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][9]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][9]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][9]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][10], accessed_by=ListGetItemGuardAccessor(10)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][10]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][10]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][10]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][11], accessed_by=ListGetItemGuardAccessor(11)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][11]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][11]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][11]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| +- GuardManager: source=L['band_mask'], accessed_by=DictGetItemGuardAccessor(band_mask)
| | +- TENSOR_MATCH: check_tensor(L['band_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 9, 64, 192], stride=[110592, 110592, 12288, 192, 1])
| | +- NO_HASATTR: hasattr(L['band_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| +- GuardManager: source=L['from_mask'], accessed_by=DictGetItemGuardAccessor(from_mask)
| | +- TENSOR_MATCH: check_tensor(L['from_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 832, 1], stride=[832, 832, 1, 1])
| | +- NO_HASATTR: hasattr(L['from_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| +- GuardManager: source=L['key_layer'], accessed_by=DictGetItemGuardAccessor(key_layer)
| | +- TENSOR_MATCH: check_tensor(L['key_layer'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.bfloat16, device=None, requires_grad=False, size=[1, 12, 832, 64], stride=[638976, 64, 768, 1])
| | +- NO_HASATTR: hasattr(L['key_layer'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| +- GuardManager: source=L['batch_size'], accessed_by=DictGetItemGuardAccessor(batch_size)
| | +- EQUALS_MATCH: L['batch_size'] == 1
| +- GuardManager: source=L['to_seq_len'], accessed_by=DictGetItemGuardAccessor(to_seq_len)
| | +- EQUALS_MATCH: L['to_seq_len'] == 832
| +- GuardManager: source=L['query_layer'], accessed_by=DictGetItemGuardAccessor(query_layer)
| | +- TENSOR_MATCH: check_tensor(L['query_layer'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.bfloat16, device=None, requires_grad=False, size=[1, 12, 832, 64], stride=[638976, 64, 768, 1])
| | +- NO_HASATTR: hasattr(L['query_layer'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| +- GuardManager: source=L['value_layer'], accessed_by=DictGetItemGuardAccessor(value_layer)
| | +- TENSOR_MATCH: check_tensor(L['value_layer'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.bfloat16, device=None, requires_grad=False, size=[1, 12, 832, 64], stride=[638976, 64, 768, 1])
| | +- NO_HASATTR: hasattr(L['value_layer'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| +- GuardManager: source=L['from_seq_len'], accessed_by=DictGetItemGuardAccessor(from_seq_len)
| | +- EQUALS_MATCH: L['from_seq_len'] == 832
| +- GuardManager: source=L['n_rand_blocks'], accessed_by=DictGetItemGuardAccessor(n_rand_blocks)
| | +- EQUALS_MATCH: L['n_rand_blocks'] == 3
| +- GuardManager: source=L['to_block_size'], accessed_by=DictGetItemGuardAccessor(to_block_size)
| | +- EQUALS_MATCH: L['to_block_size'] == 64
| +- GuardManager: source=L['from_block_size'], accessed_by=DictGetItemGuardAccessor(from_block_size)
| | +- EQUALS_MATCH: L['from_block_size'] == 64
| +- GuardManager: source=L['attn_mask_penalty'], accessed_by=DictGetItemGuardAccessor(attn_mask_penalty)
| | +- EQUALS_MATCH: L['attn_mask_penalty'] == -10000.0
| +- GuardManager: source=L['from_blocked_mask'], accessed_by=DictGetItemGuardAccessor(from_blocked_mask)
| | +- TENSOR_MATCH: check_tensor(L['from_blocked_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 13, 64], stride=[832, 64, 1])
| | +- NO_HASATTR: hasattr(L['from_blocked_mask'], '_dynamo_dynamic_indices') == False
| | +- TENSOR_ALIASING: L['from_blocked_mask'] is L['to_blocked_mask']
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| +- GuardManager: source=L['output_attentions'], accessed_by=DictGetItemGuardAccessor(output_attentions)
| | +- ID_MATCH: ___check_obj_id(L['output_attentions'], 7685824)
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['nn'], accessed_by=DictGetItemGuardAccessor(nn)
| | | +- ID_MATCH: ___check_obj_id(G['nn'], 139842442593680)
| | | +- GuardManager: source=G['nn'].functional, accessed_by=GetAttrGuardAccessor(functional)
| | | | +- ID_MATCH: ___check_obj_id(G['nn'].functional, 139842441627024)
| | | | +- GuardManager: source=G['nn'].functional.softmax, accessed_by=GetAttrGuardAccessor(softmax)
| | | | | +- ID_MATCH: ___check_obj_id(G['nn'].functional.softmax, 139842422997488)
| | +- GuardManager: source=G['np'], accessed_by=DictGetItemGuardAccessor(np)
| | | +- ID_MATCH: ___check_obj_id(G['np'], 139845228893488)
| | | +- GuardManager: source=G['np'].stack, accessed_by=GetAttrGuardAccessor(stack)
| | | | +- ID_MATCH: ___check_obj_id(G['np'].stack, 139844763318256)
| | +- GuardManager: source=G['torch'], accessed_by=DictGetItemGuardAccessor(torch)
| | | +- ID_MATCH: ___check_obj_id(G['torch'], 139845236322800)
| | | +- GuardManager: source=G['torch'].bmm, accessed_by=GetAttrGuardAccessor(bmm)
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].bmm, 139845228834192)
| | | +- GuardManager: source=G['torch'].cat, accessed_by=GetAttrGuardAccessor(cat)
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].cat, 139845228834672)
| | | +- GuardManager: source=G['torch'].div, accessed_by=GetAttrGuardAccessor(div)
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].div, 139845228790304)
| | | +- GuardManager: source=G['torch'].long, accessed_by=GetAttrGuardAccessor(long)
| | | | +- EQUALS_MATCH: G['torch'].long == torch.int64
| | | +- GuardManager: source=G['torch'].stack, accessed_by=GetAttrGuardAccessor(stack)
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].stack, 139845228799024)
| | | +- GuardManager: source=G['torch'].arange, accessed_by=GetAttrGuardAccessor(arange)
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].arange, 139845228706960)
| | | +- GuardManager: source=G['torch'].einsum, accessed_by=GetAttrGuardAccessor(einsum)
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].einsum, 139842415911568)
| | | +- GuardManager: source=G['torch'].tensor, accessed_by=GetAttrGuardAccessor(tensor)
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].tensor, 139845228703840)
| | | +- GuardManager: source=G['torch'].minimum, accessed_by=GetAttrGuardAccessor(minimum)
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].minimum, 139845228824272)
| | | +- GuardManager: source=G['torch'].transpose, accessed_by=GetAttrGuardAccessor(transpose)
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].transpose, 139845228736688)
| | +- GuardManager: source=G['__builtins_dict___46'], accessed_by=DictGetItemGuardAccessor(__builtins_dict___46)
| | | +- GuardManager: source=G['__builtins_dict___46']['len'], accessed_by=DictGetItemGuardAccessor(len)
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___46']['len'], 139845257826832)
| | | +- GuardManager: source=G['__builtins_dict___46']['zip'], accessed_by=DictGetItemGuardAccessor(zip)
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___46']['zip'], 7491872)
| | | +- GuardManager: source=G['__builtins_dict___46']['range'], accessed_by=DictGetItemGuardAccessor(range)
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___46']['range'], 7632448)
| | +- GuardManager: source=G['__import_torch_dot__dynamo_dot_polyfill'], accessed_by=DictGetItemGuardAccessor(__import_torch_dot__dynamo_dot_polyfill)
| | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot__dynamo_dot_polyfill'], 139839728158336)
| +- GuardManager: source=L['to_blocked_mask'], accessed_by=DictGetItemGuardAccessor(to_blocked_mask)
| | +- TENSOR_ALIASING: L['from_blocked_mask'] is L['to_blocked_mask']
V0627 17:31:09.568000 139845268738432 torch/_dynamo/] {"compilation_metrics": {"compile_id": "15/0", "frame_key": "20", "co_name": "torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/", "co_firstlineno": 583, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 58, "shape_env_guard_count": 0, "graph_op_count": 208, "graph_node_count": 228, "graph_input_count": 19, "start_time": 1719534664.260442, "entire_frame_compile_time_s": 5.308261871337891, "backend_compile_time_s": 5.101780414581299, "inductor_compile_time_s": 4.007972240447998, "code_gen_time_s": 3.5389716625213623, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": [], "dynamo_time_before_restart_s": 0.0, "has_guarded_code": true}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.580000 139845268738432 torch/_dynamo/] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 472, "name": "forward", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 16, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.582000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 0, "describer_id": 40, "size": 2555904}, "frame_id": 16, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.582000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 0, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 12, 64], "is_leaf": true, "stride": [638976, 64, 53248, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311dbd80>", "describer_id": 40}, "frame_id": 16, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.582000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 40, "id": 0, "source": "L['___stack0'][0]"}, "frame_id": 16, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.587000 139845268738432 torch/_dynamo/] {"dynamo_output_graph": {"sizes": {"l_stack0_0_": [1, 832, 12, 64], "contiguous": [1, 832, 12, 64], "context_layer": [1, 832, 768]}}, "frame_id": 16, "frame_compile_id": 0, "attempt": 0, "has_payload": "153b3dc8bb7ea7326b02a24531cf2b23"}
class GraphModule(torch.nn.Module):
def forward(self, L_stack0_0_: "f32[1, 832, 12, 64][638976, 64, 53248, 1]cpu"):
l_stack0_0_ = L_stack0_0_
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_forward_at_472, code: context_layer = context_layer.contiguous().view(batch_size, from_seq_length, -1)
contiguous: "f32[1, 832, 12, 64][638976, 768, 64, 1]cpu" = l_stack0_0_.contiguous(); l_stack0_0_ = None
context_layer: "f32[1, 832, 768][638976, 768, 1]cpu" = contiguous.view(1, 832, -1); contiguous = None
return (context_layer,)
V0627 17:31:09.599000 139845268738432 torch/_functorch/_aot_autograd/] {"aot_forward_graph": {}, "frame_id": 16, "frame_compile_id": 0, "attempt": 0, "has_payload": "4088b7608c41845b848a0fa539961d1e"}
class <lambda>(torch.nn.Module):
def forward(self, arg0_1: "f32[1, 832, 12, 64][638976, 64, 53248, 1]cpu"):
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_forward_at_472, code: context_layer = context_layer.contiguous().view(batch_size, from_seq_length, -1)
clone: "f32[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.clone.default(arg0_1, memory_format = torch.contiguous_format); arg0_1 = None
view: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.view.default(clone, [1, 832, -1]); clone = None
return (view,)
V0627 17:31:09.609000 139845268738432 torch/_inductor/] {"inductor_post_grad_graph": {}, "frame_id": 16, "frame_compile_id": 0, "attempt": 0, "has_payload": "33da1fe849e643eaf3458df62aaeea7e"}
class <lambda>(torch.nn.Module):
def forward(self, arg0_1: "f32[1, 832, 12, 64][638976, 64, 53248, 1]cpu"):
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_forward_at_472, code: context_layer = context_layer.contiguous().view(batch_size, from_seq_length, -1)
clone: "f32[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.clone.default(arg0_1, memory_format = torch.contiguous_format); arg0_1 = None
view: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.reshape.default(clone, [1, 832, -1]); clone = None
return (view,)
V0627 17:31:09.703000 139845268738432 torch/_inductor/] {"inductor_output_code": {"filename": "/tmp/torchinductor_leslie/7l/"}, "frame_id": 16, "frame_compile_id": 0, "attempt": 0, "has_payload": "675b3bf5875d915c125bff4b02eb31f4"}
# AOT ID: ['9_inference']
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
from torch._inductor.codegen.memory_planning import _align as align
from torch import device, empty_strided
from torch._inductor.async_compile import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
from torch._inductor.codegen.multi_kernel import MultiKernelCall
aten = torch.ops.aten
inductor_ops = torch.ops.inductor
_quantized = torch.ops._quantized
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
alloc_from_pool = torch.ops.inductor._alloc_from_pool
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
async_compile = AsyncCompile()
cpp_fused_clone_0 = async_compile.cpp_pybinding(['const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const float* in_ptr0,
float* out_ptr0)
#pragma omp parallel num_threads(56)
int tid = omp_get_thread_num();
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(832L); x0+=static_cast<long>(1L))
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(12L); x1+=static_cast<long>(1L))
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<long>(x2 + (64L*x0) + (53248L*x1)), 16); + static_cast<long>(x2 + (64L*x1) + (768L*x0)));
del async_compile
def call(args):
arg0_1, = args
assert_size_stride(arg0_1, (1, 832, 12, 64), (638976, 64, 53248, 1))
buf0 = empty_strided_cpu((1, 832, 12, 64), (638976, 768, 64, 1), torch.float32)
cpp_fused_clone_0(arg0_1, buf0)
del arg0_1
return (reinterpret_tensor(buf0, (1, 832, 768), (638976, 768, 1), 0), )
def benchmark_compiled_module(times=10, repeat=10):
from torch._dynamo.testing import rand_strided
from torch._inductor.utils import print_performance
arg0_1 = rand_strided((1, 832, 12, 64), (638976, 64, 53248, 1), device='cpu', dtype=torch.float32)
fn = lambda: call([arg0_1])
return print_performance(fn, times=times, repeat=repeat)
if __name__ == "__main__":
from torch._inductor.wrapper_benchmark import compiled_module_main
compiled_module_main('hf_BigBird', benchmark_compiled_module)
V0627 17:31:09.710000 139845268738432 torch/_dynamo/] {"dynamo_guards": {}, "frame_id": 16, "frame_compile_id": 0, "attempt": 0, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
V0627 17:31:09.710000 139845268738432 torch/_dynamo/] {"dynamo_cpp_guards_str": {}, "frame_id": 16, "frame_compile_id": 0, "attempt": 0, "has_payload": "832b6bdf2f2092cb0e2ca7f3e3a30237"}
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/ in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['___stack0'], accessed_by=DictGetItemGuardAccessor(___stack0)
| | +- TYPE_MATCH: ___check_type_id(L['___stack0'], 7625984)
| | +- LENGTH_CHECK: len(L['___stack0']) == 2
| | +- GuardManager: source=L['___stack0'][0], accessed_by=TupleGetItemGuardAccessor(0)
| | | +- TENSOR_MATCH: check_tensor(L['___stack0'][0], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 832, 12, 64], stride=[638976, 64, 53248, 1])
| | | +- NO_HASATTR: hasattr(L['___stack0'][0], '_dynamo_dynamic_indices') == False
| | +- GuardManager: source=L['___stack0'][1], accessed_by=TupleGetItemGuardAccessor(1)
| | | +- ID_MATCH: ___check_obj_id(L['___stack0'][1], 7636800)
| +- GuardManager: source=L['batch_size'], accessed_by=DictGetItemGuardAccessor(batch_size)
| | +- EQUALS_MATCH: L['batch_size'] == 1
| +- GuardManager: source=L['from_seq_length'], accessed_by=DictGetItemGuardAccessor(from_seq_length)
| | +- EQUALS_MATCH: L['from_seq_length'] == 832
| +- GuardManager: source=L['output_attentions'], accessed_by=DictGetItemGuardAccessor(output_attentions)
| | +- ID_MATCH: ___check_obj_id(L['output_attentions'], 7685824)
V0627 17:31:09.710000 139845268738432 torch/_dynamo/] {"compilation_metrics": {"compile_id": "16/0", "frame_key": "21", "co_name": "torch_dynamo_resume_in_forward_at_472", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/", "co_firstlineno": 472, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 11, "shape_env_guard_count": 0, "graph_op_count": 2, "graph_node_count": 4, "graph_input_count": 1, "start_time": 1719534669.5804062, "entire_frame_compile_time_s": 0.13004136085510254, "backend_compile_time_s": 0.12020564079284668, "inductor_compile_time_s": 0.09919452667236328, "code_gen_time_s": 0.08350419998168945, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": [], "dynamo_time_before_restart_s": 0.0, "has_guarded_code": true}, "frame_id": 16, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.711000 139845268738432 torch/_dynamo/] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.719000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 0, "describer_id": 42, "size": 2555904}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.719000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 0, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30911a30>", "describer_id": 42}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.719000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 42, "id": 0, "source": "L['___stack0'][0]"}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.724000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 3, "describer_id": 42, "size": 2555904}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.724000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 4, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 3, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6e30>", "describer_id": 42}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.724000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 42, "id": 4, "source": "L['hidden_states']"}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.731000 139845268738432 torch/_dynamo/] {"dynamo_output_graph": {"sizes": {"l_stack0_0_": [1, 832, 768], "l_hidden_states_": [1, 832, 768], "hidden_states": [1, 832, 768], "hidden_states_1": [1, 832, 768], "add": [1, 832, 768], "hidden_states_2": [1, 832, 768]}}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0, "has_payload": "5cbaeaa3b94e9560f38738cbbbf2efd6"}
class GraphModule(torch.nn.Module):
def forward(self, L_stack0_0_: "f32[1, 832, 768][638976, 768, 1]cpu", L_hidden_states_: "f32[1, 832, 768][638976, 768, 1]cpu"):
l_stack0_0_ = L_stack0_0_
l_hidden_states_ = L_hidden_states_
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: hidden_states = self.dense(hidden_states)
hidden_states: "bf16[1, 832, 768][638976, 768, 1]cpu" = self.L__self___output_dense(l_stack0_0_); l_stack0_0_ = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: hidden_states = self.dropout(hidden_states)
hidden_states_1: "bf16[1, 832, 768][638976, 768, 1]cpu" = self.L__self___output_dropout(hidden_states); hidden_states = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: hidden_states = self.LayerNorm(hidden_states + input_tensor)
add: "f32[1, 832, 768][638976, 768, 1]cpu" = hidden_states_1 + l_hidden_states_; hidden_states_1 = l_hidden_states_ = None
hidden_states_2: "f32[1, 832, 768][638976, 768, 1]cpu" = self.L__self___output_LayerNorm(add); add = None
return (hidden_states_2,)
V0627 17:31:09.771000 139845268738432 torch/_functorch/_aot_autograd/] {"aot_forward_graph": {}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0, "has_payload": "1cd1232b8ea80a91453ce72d7309f42c"}
class <lambda>(torch.nn.Module):
def forward(self, arg0_1: "f32[768, 768][768, 1]cpu", arg1_1: "f32[768][1]cpu", arg2_1: "f32[768][1]cpu", arg3_1: "f32[768][1]cpu", arg4_1: "f32[1, 832, 768][638976, 768, 1]cpu", arg5_1: "f32[1, 832, 768][638976, 768, 1]cpu"):
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: hidden_states = self.dense(hidden_states)
convert_element_type: "bf16[768][1]cpu" = torch.ops.prims.convert_element_type.default(arg1_1, torch.bfloat16); arg1_1 = None
convert_element_type_1: "bf16[768, 768][768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg0_1, torch.bfloat16); arg0_1 = None
convert_element_type_2: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg4_1, torch.bfloat16); arg4_1 = None
view: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.view.default(convert_element_type_2, [832, 768]); convert_element_type_2 = None
permute: "bf16[768, 768][1, 768]cpu" = torch.ops.aten.permute.default(convert_element_type_1, [1, 0]); convert_element_type_1 = None
addmm: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.addmm.default(convert_element_type, view, permute); convert_element_type = view = permute = None
view_1: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.view.default(addmm, [1, 832, 768]); addmm = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: hidden_states = self.dropout(hidden_states)
clone: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.clone.default(view_1); view_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: hidden_states = self.LayerNorm(hidden_states + input_tensor)
add: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(clone, arg5_1); clone = arg5_1 = None
var_mean = torch.ops.aten.var_mean.correction(add, [2], correction = 0, keepdim = True)
getitem: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[0]
getitem_1: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[1]; var_mean = None
add_1: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.add.Tensor(getitem, 1e-12); getitem = None
rsqrt: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.rsqrt.default(add_1); add_1 = None
sub: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.sub.Tensor(add, getitem_1); add = getitem_1 = None
mul: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(sub, rsqrt); sub = rsqrt = None
mul_1: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(mul, arg2_1); mul = arg2_1 = None
add_2: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(mul_1, arg3_1); mul_1 = arg3_1 = None
return (add_2,)
V0627 17:31:09.822000 139845268738432 torch/_inductor/] {"inductor_post_grad_graph": {}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0, "has_payload": "e2b95d7d56d3ed2a8ad6cfb284f41613"}
class <lambda>(torch.nn.Module):
def forward(self, arg4_1: "f32[1, 832, 768][638976, 768, 1]cpu", arg5_1: "f32[1, 832, 768][638976, 768, 1]cpu"):
# No stacktrace found for following nodes
_frozen_param2: "f32[768][1]cpu" = self._frozen_param2
_frozen_param3: "f32[768][1]cpu" = self._frozen_param3
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: hidden_states = self.dense(hidden_states)
_frozen_param4: "bf16[768][1]cpu" = self._frozen_param4
# No stacktrace found for following nodes
_frozen_param6: "bf16[768, 768][1, 0]cpu" = self._frozen_param6
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: hidden_states = self.dense(hidden_states)
convert_element_type_2: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg4_1, torch.bfloat16); arg4_1 = None
_linear_pointwise_default_1: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.mkldnn._linear_pointwise.default(convert_element_type_2, _frozen_param6, _frozen_param4, 'none', [], ''); convert_element_type_2 = _frozen_param6 = _frozen_param4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: hidden_states = self.LayerNorm(hidden_states + input_tensor)
add: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(_linear_pointwise_default_1, arg5_1); _linear_pointwise_default_1 = arg5_1 = None
var_mean = torch.ops.aten.var_mean.correction(add, [2], correction = 0, keepdim = True)
getitem: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[0]
getitem_1: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[1]; var_mean = None
sub: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.sub.Tensor(add, getitem_1); add = getitem_1 = None
add_1: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.add.Tensor(getitem, 1e-12); getitem = None
rsqrt: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.rsqrt.default(add_1); add_1 = None
mul: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(sub, rsqrt); sub = rsqrt = None
mul_1: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(mul, _frozen_param2); mul = _frozen_param2 = None
add_2: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(mul_1, _frozen_param3); mul_1 = _frozen_param3 = None
return (add_2,)
V0627 17:31:09.919000 139845268738432 torch/_inductor/] {"inductor_output_code": {"filename": "/tmp/torchinductor_leslie/ot/"}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0, "has_payload": "320320d26970537cad9fa4b92420ab78"}
# AOT ID: ['10_inference']
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
from torch._inductor.codegen.memory_planning import _align as align
from torch import device, empty_strided
from torch._inductor.async_compile import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
from torch._inductor.codegen.multi_kernel import MultiKernelCall
aten = torch.ops.aten
inductor_ops = torch.ops.inductor
_quantized = torch.ops._quantized
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
alloc_from_pool = torch.ops.inductor._alloc_from_pool
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
async_compile = AsyncCompile()
_frozen_param2 = None # device(type='cpu') torch.float32 (768,) (1,) 7f2eb1d44fe0
_frozen_param3 = None # device(type='cpu') torch.float32 (768,) (1,) 7f2eb1d45080
_frozen_param4 = None # device(type='cpu') torch.bfloat16 (768,) (1,) 7f2e30928a90
_frozen_param6 = None # device(type='cpu') torch.bfloat16 (768, 768) (1, 0) 7f2e303a2cf0
cpp_fused__to_copy_0 = async_compile.cpp_pybinding(['const float*', 'bfloat16*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const float* in_ptr0,
bfloat16* out_ptr0)
#pragma omp parallel num_threads(56)
int tid = omp_get_thread_num();
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(638976L); x0+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<long>(x0), 16);
auto tmp1 = at::vec::convert<bfloat16>(tmp0); + static_cast<long>(x0), 16);
cpp_fused_add_native_layer_norm_1 = async_compile.cpp_pybinding(['const bfloat16*', 'const float*', 'const float*', 'const float*', 'float*', 'float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const bfloat16* in_ptr0,
const float* in_ptr1,
const float* in_ptr2,
const float* in_ptr3,
float* out_ptr0,
float* out_ptr1,
float* out_ptr2)
#pragma omp parallel num_threads(56)
int tid = omp_get_thread_num();
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(832L); x0+=static_cast<long>(1L))
Welford<float> tmp_acc0 = Welford<float>();
Welford<at::vec::Vectorized<float>> tmp_acc0_vec = Welford<at::vec::Vectorized<float>>();
static WeightRecp<at::vec::Vectorized<float>> weight_recps(static_cast<long>(48L));
for(long x1=static_cast<long>(0L); x1<static_cast<long>(768L); x1+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x1 + (768L*x0)), 16);
auto tmp2 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(x1 + (768L*x0)), 16);
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp3 = tmp1 + tmp2;
tmp_acc0_vec = welford_combine(tmp_acc0_vec, tmp3, &weight_recps);
tmp_acc0 = welford_combine(tmp_acc0, welford_vec_reduce_all(tmp_acc0_vec));
out_ptr0[static_cast<long>(x0)] = static_cast<float>(tmp_acc0.mean);
out_ptr1[static_cast<long>(x0)] = static_cast<float>(tmp_acc0.m2);
for(long x1=static_cast<long>(0L); x1<static_cast<long>(768L); x1+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x1 + (768L*x0)), 16);
auto tmp2 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(x1 + (768L*x0)), 16);
auto tmp4 = out_ptr0[static_cast<long>(x0)];
auto tmp7 = out_ptr1[static_cast<long>(x0)];
auto tmp15 = at::vec::Vectorized<float>::loadu(in_ptr2 + static_cast<long>(x1), 16);
auto tmp17 = at::vec::Vectorized<float>::loadu(in_ptr3 + static_cast<long>(x1), 16);
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp3 = tmp1 + tmp2;
auto tmp5 = at::vec::Vectorized<float>(tmp4);
auto tmp6 = tmp3 - tmp5;
auto tmp8 = static_cast<float>(768.0);
auto tmp9 = tmp7 / tmp8;
auto tmp10 = static_cast<float>(1e-12);
auto tmp11 = decltype(tmp9)(tmp9 + tmp10);
auto tmp12 = 1 / std::sqrt(tmp11);
auto tmp13 = at::vec::Vectorized<float>(tmp12);
auto tmp14 = tmp6 * tmp13;
auto tmp16 = tmp14 * tmp15;
auto tmp18 = tmp16 + tmp17; + static_cast<long>(x1 + (768L*x0)));
del async_compile
def call(args):
arg4_1, arg5_1 = args
assert_size_stride(arg4_1, (1, 832, 768), (638976, 768, 1))
assert_size_stride(arg5_1, (1, 832, 768), (638976, 768, 1))
buf0 = empty_strided_cpu((1, 832, 768), (638976, 768, 1), torch.bfloat16)
cpp_fused__to_copy_0(arg4_1, buf0)
del arg4_1
buf1 = torch.ops.mkldnn._linear_pointwise(buf0, _frozen_param6, _frozen_param4, 'none', [-1], '')
del buf0
buf2 = empty_strided_cpu((1, 832, 1), (832, 1, 832), torch.float32)
buf3 = empty_strided_cpu((1, 832, 1), (832, 1, 832), torch.float32)
buf5 = empty_strided_cpu((1, 832, 768), (638976, 768, 1), torch.float32)
cpp_fused_add_native_layer_norm_1(buf1, arg5_1, _frozen_param2, _frozen_param3, buf2, buf3, buf5)
del arg5_1
return (buf5, )
def benchmark_compiled_module(times=10, repeat=10):
from torch._dynamo.testing import rand_strided
from torch._inductor.utils import print_performance
global _frozen_param2
_frozen_param2 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param3
_frozen_param3 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param4
_frozen_param4 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.bfloat16)
global _frozen_param6
_frozen_param6 = rand_strided((768, 768), (1, 0), device='cpu', dtype=torch.bfloat16)
arg4_1 = rand_strided((1, 832, 768), (638976, 768, 1), device='cpu', dtype=torch.float32)
arg5_1 = rand_strided((1, 832, 768), (638976, 768, 1), device='cpu', dtype=torch.float32)
fn = lambda: call([arg4_1, arg5_1])
return print_performance(fn, times=times, repeat=repeat)
if __name__ == "__main__":
from torch._inductor.wrapper_benchmark import compiled_module_main
compiled_module_main('hf_BigBird', benchmark_compiled_module)
V0627 17:31:09.931000 139845268738432 torch/_dynamo/] {"dynamo_guards": {}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
V0627 17:31:09.932000 139845268738432 torch/_dynamo/] {"dynamo_cpp_guards_str": {}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0, "has_payload": "cc9600447bc28ad3ba928d7719c0654d"}
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/ in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202275632)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| | | +- GuardManager: source=L['self']._modules, accessed_by=DictGetItemGuardAccessor(_modules)
| | | | +- GuardManager: source=L['self'].output, accessed_by=DictGetItemGuardAccessor(output)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].output, 139839202272272)
| | | | | +- GuardManager: source=L['self'].output.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- DICT_CONTAINS: not ___dict_contains('forward', L['self'].output.__dict__)
| | | | | | +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'], 7685824)
| | | | | | +- GuardManager: source=L['self'].output._modules, accessed_by=DictGetItemGuardAccessor(_modules)
| | | | | | | +- GuardManager: source=L['self'].output.dense, accessed_by=DictGetItemGuardAccessor(dense)
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.dense, 139839202271456)
| | | | | | | | +- GuardManager: source=L['self'].output.dense.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | | | | +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'], 7685824)
| | | | | | | +- GuardManager: source=L['self'].output.dropout, accessed_by=DictGetItemGuardAccessor(dropout)
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.dropout, 139839202271168)
| | | | | | | | +- GuardManager: source=L['self'].output.dropout.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | | | | +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'], 7685824)
| | | | | | | +- GuardManager: source=L['self'].output.LayerNorm, accessed_by=DictGetItemGuardAccessor(LayerNorm)
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.LayerNorm, 139839202271504)
| | | | | | | | +- GuardManager: source=L['self'].output.LayerNorm.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | | | | +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'], 7685824)
| | | | | | +- GuardManager: source=L['self'].output._forward_hooks, accessed_by=DictGetItemGuardAccessor(_forward_hooks)
| | | | | | +- GuardManager: source=L['self'].output._backward_hooks, accessed_by=DictGetItemGuardAccessor(_backward_hooks)
| | | | | | +- GuardManager: source=L['self'].output._forward_pre_hooks, accessed_by=DictGetItemGuardAccessor(_forward_pre_hooks)
| | | | | | +- GuardManager: source=L['self'].output._backward_pre_hooks, accessed_by=DictGetItemGuardAccessor(_backward_pre_hooks)
| +- GuardManager: source=L['___stack0'], accessed_by=DictGetItemGuardAccessor(___stack0)
| | +- TYPE_MATCH: ___check_type_id(L['___stack0'], 7625984)
| | +- LENGTH_CHECK: len(L['___stack0']) == 1
| | +- GuardManager: source=L['___stack0'][0], accessed_by=TupleGetItemGuardAccessor(0)
| | | +- TENSOR_MATCH: check_tensor(L['___stack0'][0], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 832, 768], stride=[638976, 768, 1])
| | | +- NO_HASATTR: hasattr(L['___stack0'][0], '_dynamo_dynamic_indices') == False
| | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['___stack0'][0], L['hidden_states'])
| +- GuardManager: source=L['hidden_states'], accessed_by=DictGetItemGuardAccessor(hidden_states)
| | +- TENSOR_MATCH: check_tensor(L['hidden_states'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 832, 768], stride=[638976, 768, 1])
| | +- NO_HASATTR: hasattr(L['hidden_states'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['___stack0'][0], L['hidden_states'])
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'], accessed_by=DictGetItemGuardAccessor(__import_torch_dot_nn_dot_modules_dot_module)
| | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'], 139842442598640)
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'].torch, accessed_by=GetAttrGuardAccessor(torch)
| | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'].torch, 139845236322800)
| | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C, accessed_by=GetAttrGuardAccessor(_C)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C, 139845228547104)
| | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C._get_tracing_state, accessed_by=GetAttrGuardAccessor(_get_tracing_state)
| | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C._get_tracing_state, 139842451895088)
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_hooks, accessed_by=GetAttrGuardAccessor(_global_forward_hooks)
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_hooks, 7497696)
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_hooks
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_hooks, accessed_by=GetAttrGuardAccessor(_global_backward_hooks)
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_hooks, 7497696)
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_hooks
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_pre_hooks, accessed_by=GetAttrGuardAccessor(_global_forward_pre_hooks)
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_pre_hooks, 7497696)
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_pre_hooks
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_pre_hooks, accessed_by=GetAttrGuardAccessor(_global_backward_pre_hooks)
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_pre_hooks, 7497696)
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_pre_hooks
V0627 17:31:09.932000 139845268738432 torch/_dynamo/] {"compilation_metrics": {"compile_id": "17/0", "frame_key": "22", "co_name": "torch_dynamo_resume_in_forward_at_1401", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/", "co_firstlineno": 1401, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 34, "shape_env_guard_count": 0, "graph_op_count": 4, "graph_node_count": 7, "graph_input_count": 2, "start_time": 1719534669.711534, "entire_frame_compile_time_s": 0.22069621086120605, "backend_compile_time_s": 0.1933588981628418, "inductor_compile_time_s": 0.11173701286315918, "code_gen_time_s": 0.08121824264526367, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": [], "dynamo_time_before_restart_s": 0.0, "has_guarded_code": true}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.933000 139845268738432 torch/_dynamo/] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 18, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.935000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 0, "describer_id": 44, "size": 2555904}, "frame_id": 18, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.935000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 0, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e309a5760>", "describer_id": 44}, "frame_id": 18, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.935000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 44, "id": 0, "source": "L['___stack0'][0]"}, "frame_id": 18, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.984000 139845268738432 torch/_dynamo/] {"dynamo_output_graph": {"sizes": {"l_stack0_0_": [1, 832, 768], "hidden_states": [1, 832, 3072], "mul": [1, 832, 3072], "pow_1": [1, 832, 3072], "mul_1": [1, 832, 3072], "add": [1, 832, 3072], "mul_2": [1, 832, 3072], "tanh": [1, 832, 3072], "add_1": [1, 832, 3072], "hidden_states_1": [1, 832, 3072], "hidden_states_2": [1, 832, 768], "hidden_states_3": [1, 832, 768], "add_2": [1, 832, 768], "hidden_states_4": [1, 832, 768]}}, "frame_id": 18, "frame_compile_id": 0, "attempt": 0, "has_payload": "28432eb8c22b77d39d8eae55f0796aec"}
class GraphModule(torch.nn.Module):
def forward(self, L_stack0_0_: "f32[1, 832, 768][638976, 768, 1]cpu"):
l_stack0_0_ = L_stack0_0_
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: hidden_states = self.dense(hidden_states)
hidden_states: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = self.L__self___intermediate_dense(l_stack0_0_)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/ in forward, code: return 0.5 * input * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0))))
mul: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = 0.5 * hidden_states
pow_1: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.pow(hidden_states, 3.0)
mul_1: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = 0.044715 * pow_1; pow_1 = None
add: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = hidden_states + mul_1; hidden_states = mul_1 = None
mul_2: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = 0.7978845608028654 * add; add = None
tanh: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.tanh(mul_2); mul_2 = None
add_1: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = 1.0 + tanh; tanh = None
hidden_states_1: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = mul * add_1; mul = add_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: hidden_states = self.dense(hidden_states)
hidden_states_2: "bf16[1, 832, 768][638976, 768, 1]cpu" = self.L__self___output_dense(hidden_states_1); hidden_states_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: hidden_states = self.dropout(hidden_states)
hidden_states_3: "bf16[1, 832, 768][638976, 768, 1]cpu" = self.L__self___output_dropout(hidden_states_2); hidden_states_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: hidden_states = self.LayerNorm(hidden_states + input_tensor)
add_2: "f32[1, 832, 768][638976, 768, 1]cpu" = hidden_states_3 + l_stack0_0_; hidden_states_3 = l_stack0_0_ = None
hidden_states_4: "f32[1, 832, 768][638976, 768, 1]cpu" = self.L__self___output_LayerNorm(add_2); add_2 = None
return (hidden_states_4,)
V0627 17:31:10.051000 139845268738432 torch/_functorch/_aot_autograd/] {"aot_forward_graph": {}, "frame_id": 18, "frame_compile_id": 0, "attempt": 0, "has_payload": "c94939d327a02b378b1745a04171ca4e"}
class <lambda>(torch.nn.Module):
def forward(self, arg0_1: "f32[3072, 768][768, 1]cpu", arg1_1: "f32[3072][1]cpu", arg2_1: "f32[768, 3072][3072, 1]cpu", arg3_1: "f32[768][1]cpu", arg4_1: "f32[768][1]cpu", arg5_1: "f32[768][1]cpu", arg6_1: "f32[1, 832, 768][638976, 768, 1]cpu"):
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: hidden_states = self.dense(hidden_states)
convert_element_type: "bf16[3072][1]cpu" = torch.ops.prims.convert_element_type.default(arg1_1, torch.bfloat16); arg1_1 = None
convert_element_type_1: "bf16[3072, 768][768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg0_1, torch.bfloat16); arg0_1 = None
convert_element_type_2: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg6_1, torch.bfloat16)
view: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.view.default(convert_element_type_2, [832, 768]); convert_element_type_2 = None
permute: "bf16[768, 3072][1, 768]cpu" = torch.ops.aten.permute.default(convert_element_type_1, [1, 0]); convert_element_type_1 = None
addmm: "bf16[832, 3072][3072, 1]cpu" = torch.ops.aten.addmm.default(convert_element_type, view, permute); convert_element_type = view = permute = None
view_1: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.view.default(addmm, [1, 832, 3072]); addmm = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/ in forward, code: return 0.5 * input * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0))))
mul: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.mul.Tensor(view_1, 0.5)
pow_1: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.pow.Tensor_Scalar(view_1, 3.0)
mul_1: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.mul.Tensor(pow_1, 0.044715); pow_1 = None
add: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.add.Tensor(view_1, mul_1); view_1 = mul_1 = None
mul_2: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.mul.Tensor(add, 0.7978845608028654); add = None
tanh: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.tanh.default(mul_2); mul_2 = None
add_1: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.add.Tensor(tanh, 1.0); tanh = None
mul_3: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.mul.Tensor(mul, add_1); mul = add_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: hidden_states = self.dense(hidden_states)
convert_element_type_6: "bf16[768][1]cpu" = torch.ops.prims.convert_element_type.default(arg3_1, torch.bfloat16); arg3_1 = None
convert_element_type_7: "bf16[768, 3072][3072, 1]cpu" = torch.ops.prims.convert_element_type.default(arg2_1, torch.bfloat16); arg2_1 = None
view_2: "bf16[832, 3072][3072, 1]cpu" = torch.ops.aten.view.default(mul_3, [832, 3072]); mul_3 = None
permute_1: "bf16[3072, 768][1, 3072]cpu" = torch.ops.aten.permute.default(convert_element_type_7, [1, 0]); convert_element_type_7 = None
addmm_1: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.addmm.default(convert_element_type_6, view_2, permute_1); convert_element_type_6 = view_2 = permute_1 = None
view_3: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.view.default(addmm_1, [1, 832, 768]); addmm_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: hidden_states = self.dropout(hidden_states)
clone: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.clone.default(view_3); view_3 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: hidden_states = self.LayerNorm(hidden_states + input_tensor)
add_2: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(clone, arg6_1); clone = arg6_1 = None
var_mean = torch.ops.aten.var_mean.correction(add_2, [2], correction = 0, keepdim = True)
getitem: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[0]
getitem_1: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[1]; var_mean = None
add_3: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.add.Tensor(getitem, 1e-12); getitem = None
rsqrt: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.rsqrt.default(add_3); add_3 = None
sub: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.sub.Tensor(add_2, getitem_1); add_2 = getitem_1 = None
mul_4: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(sub, rsqrt); sub = rsqrt = None
mul_5: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(mul_4, arg4_1); mul_4 = arg4_1 = None
add_4: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(mul_5, arg5_1); mul_5 = arg5_1 = None
return (add_4,)
V0627 17:31:10.133000 139845268738432 torch/_inductor/] {"inductor_post_grad_graph": {}, "frame_id": 18, "frame_compile_id": 0, "attempt": 0, "has_payload": "8400618ae53b7968980ef85788f68b83"}
class <lambda>(torch.nn.Module):
def forward(self, arg6_1: "f32[1, 832, 768][638976, 768, 1]cpu"):
# No stacktrace found for following nodes
_frozen_param4: "f32[768][1]cpu" = self._frozen_param4
_frozen_param5: "f32[768][1]cpu" = self._frozen_param5
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: hidden_states = self.dense(hidden_states)
_frozen_param6: "bf16[3072][1]cpu" = self._frozen_param6
# No stacktrace found for following nodes
_frozen_param10: "bf16[3072, 768][1, 0]cpu" = self._frozen_param10
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: hidden_states = self.dense(hidden_states)
_frozen_param8: "bf16[768][1]cpu" = self._frozen_param8
# No stacktrace found for following nodes
_frozen_param11: "bf16[768, 3072][1, 0]cpu" = self._frozen_param11
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: hidden_states = self.dense(hidden_states)
convert_element_type_2: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg6_1, torch.bfloat16)
_linear_pointwise_default_3: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.mkldnn._linear_pointwise.default(convert_element_type_2, _frozen_param10, _frozen_param6, 'none', [], ''); convert_element_type_2 = _frozen_param10 = _frozen_param6 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/ in forward, code: return 0.5 * input * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0))))
mul: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.mul.Tensor(_linear_pointwise_default_3, 0.5)
pow_1: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.pow.Tensor_Scalar(_linear_pointwise_default_3, 3.0)
mul_1: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.mul.Tensor(pow_1, 0.044715); pow_1 = None
add: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.add.Tensor(_linear_pointwise_default_3, mul_1); _linear_pointwise_default_3 = mul_1 = None
mul_2: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.mul.Tensor(add, 0.7978845608028654); add = None
tanh: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.tanh.default(mul_2); mul_2 = None
add_1: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.add.Tensor(tanh, 1.0); tanh = None
mul_3: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.mul.Tensor(mul, add_1); mul = add_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: hidden_states = self.dense(hidden_states)
_linear_pointwise_default_2: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.mkldnn._linear_pointwise.default(mul_3, _frozen_param11, _frozen_param8, 'none', [], ''); mul_3 = _frozen_param11 = _frozen_param8 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: hidden_states = self.LayerNorm(hidden_states + input_tensor)
add_2: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(_linear_pointwise_default_2, arg6_1); _linear_pointwise_default_2 = arg6_1 = None
var_mean = torch.ops.aten.var_mean.correction(add_2, [2], correction = 0, keepdim = True)
getitem: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[0]
getitem_1: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[1]; var_mean = None
sub: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.sub.Tensor(add_2, getitem_1); add_2 = getitem_1 = None
add_3: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.add.Tensor(getitem, 1e-12); getitem = None
rsqrt: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.rsqrt.default(add_3); add_3 = None
mul_4: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(sub, rsqrt); sub = rsqrt = None
mul_5: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(mul_4, _frozen_param4); mul_4 = _frozen_param4 = None
add_4: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(mul_5, _frozen_param5); mul_5 = _frozen_param5 = None
return (add_4,)
V0627 17:31:10.240000 139845268738432 torch/_inductor/] {"inductor_output_code": {"filename": "/tmp/torchinductor_leslie/yq/"}, "frame_id": 18, "frame_compile_id": 0, "attempt": 0, "has_payload": "220b8ade00d54ed30a9ebc3492a6ee4d"}
# AOT ID: ['11_inference']
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
from torch._inductor.codegen.memory_planning import _align as align
from torch import device, empty_strided
from torch._inductor.async_compile import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
from torch._inductor.codegen.multi_kernel import MultiKernelCall
aten = torch.ops.aten
inductor_ops = torch.ops.inductor
_quantized = torch.ops._quantized
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
alloc_from_pool = torch.ops.inductor._alloc_from_pool
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
async_compile = AsyncCompile()
_frozen_param4 = None # device(type='cpu') torch.float32 (768,) (1,) 7f2eb1d45300
_frozen_param5 = None # device(type='cpu') torch.float32 (768,) (1,) 7f2eb1d45350
_frozen_param6 = None # device(type='cpu') torch.bfloat16 (3072,) (1,) 7f2e301a7600
_frozen_param10 = None # device(type='cpu') torch.bfloat16 (3072, 768) (1, 0) 7f2e3013c8b0
_frozen_param8 = None # device(type='cpu') torch.bfloat16 (768,) (1,) 7f2e3013f830
_frozen_param11 = None # device(type='cpu') torch.bfloat16 (768, 3072) (1, 0) 7f2e3013c2c0
cpp_fused__to_copy_0 = async_compile.cpp_pybinding(['const float*', 'bfloat16*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const float* in_ptr0,
bfloat16* out_ptr0)
#pragma omp parallel num_threads(56)
int tid = omp_get_thread_num();
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(638976L); x0+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<long>(x0), 16);
auto tmp1 = at::vec::convert<bfloat16>(tmp0); + static_cast<long>(x0), 16);
cpp_fused_add_mul_pow_tanh_1 = async_compile.cpp_pybinding(['bfloat16*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(bfloat16* in_out_ptr0)
#pragma omp parallel num_threads(56)
int tid = omp_get_thread_num();
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(2555904L); x0+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_out_ptr0 + static_cast<long>(x0), 16);
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = static_cast<float>(0.5);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 * tmp3;
auto tmp5 = tmp1 * tmp1;
auto tmp6 = tmp5 * tmp1;
auto tmp7 = static_cast<float>(0.044715);
auto tmp8 = at::vec::Vectorized<float>(tmp7);
auto tmp9 = tmp6 * tmp8;
auto tmp10 = tmp1 + tmp9;
auto tmp11 = static_cast<float>(0.7978845608028654);
auto tmp12 = at::vec::Vectorized<float>(tmp11);
auto tmp13 = tmp10 * tmp12;
auto tmp14 = decltype(tmp13)(2) / (decltype(tmp13)(1) + (decltype(tmp13)(-2) * tmp13).exp()) - decltype(tmp13)(1);
auto tmp15 = static_cast<float>(1.0);
auto tmp16 = at::vec::Vectorized<float>(tmp15);
auto tmp17 = tmp14 + tmp16;
auto tmp18 = tmp4 * tmp17;
auto tmp19 = at::vec::convert<bfloat16>(tmp18); + static_cast<long>(x0), 16);
cpp_fused_add_native_layer_norm_2 = async_compile.cpp_pybinding(['const bfloat16*', 'const float*', 'const float*', 'const float*', 'float*', 'float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const bfloat16* in_ptr0,
const float* in_ptr1,
const float* in_ptr2,
const float* in_ptr3,
float* out_ptr0,
float* out_ptr1,
float* out_ptr2)
#pragma omp parallel num_threads(56)
int tid = omp_get_thread_num();
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(832L); x0+=static_cast<long>(1L))
Welford<float> tmp_acc0 = Welford<float>();
Welford<at::vec::Vectorized<float>> tmp_acc0_vec = Welford<at::vec::Vectorized<float>>();
static WeightRecp<at::vec::Vectorized<float>> weight_recps(static_cast<long>(48L));
for(long x1=static_cast<long>(0L); x1<static_cast<long>(768L); x1+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x1 + (768L*x0)), 16);
auto tmp2 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(x1 + (768L*x0)), 16);
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp3 = tmp1 + tmp2;
tmp_acc0_vec = welford_combine(tmp_acc0_vec, tmp3, &weight_recps);
tmp_acc0 = welford_combine(tmp_acc0, welford_vec_reduce_all(tmp_acc0_vec));
out_ptr0[static_cast<long>(x0)] = static_cast<float>(tmp_acc0.mean);
out_ptr1[static_cast<long>(x0)] = static_cast<float>(tmp_acc0.m2);
for(long x1=static_cast<long>(0L); x1<static_cast<long>(768L); x1+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x1 + (768L*x0)), 16);
auto tmp2 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(x1 + (768L*x0)), 16);
auto tmp4 = out_ptr0[static_cast<long>(x0)];
auto tmp7 = out_ptr1[static_cast<long>(x0)];
auto tmp15 = at::vec::Vectorized<float>::loadu(in_ptr2 + static_cast<long>(x1), 16);
auto tmp17 = at::vec::Vectorized<float>::loadu(in_ptr3 + static_cast<long>(x1), 16);
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp3 = tmp1 + tmp2;
auto tmp5 = at::vec::Vectorized<float>(tmp4);
auto tmp6 = tmp3 - tmp5;
auto tmp8 = static_cast<float>(768.0);
auto tmp9 = tmp7 / tmp8;
auto tmp10 = static_cast<float>(1e-12);
auto tmp11 = decltype(tmp9)(tmp9 + tmp10);
auto tmp12 = 1 / std::sqrt(tmp11);
auto tmp13 = at::vec::Vectorized<float>(tmp12);
auto tmp14 = tmp6 * tmp13;
auto tmp16 = tmp14 * tmp15;
auto tmp18 = tmp16 + tmp17; + static_cast<long>(x1 + (768L*x0)));
del async_compile
def call(args):
arg6_1, = args
assert_size_stride(arg6_1, (1, 832, 768), (638976, 768, 1))
buf0 = empty_strided_cpu((1, 832, 768), (638976, 768, 1), torch.bfloat16)
cpp_fused__to_copy_0(arg6_1, buf0)
buf1 = torch.ops.mkldnn._linear_pointwise(buf0, _frozen_param10, _frozen_param6, 'none', [-1], '')
del buf0
buf2 = buf1; del buf1 # reuse
buf3 = torch.ops.mkldnn._linear_pointwise(buf2, _frozen_param11, _frozen_param8, 'none', [-1], '')
del buf2
buf4 = empty_strided_cpu((1, 832, 1), (832, 1, 832), torch.float32)
buf5 = empty_strided_cpu((1, 832, 1), (832, 1, 832), torch.float32)
buf7 = empty_strided_cpu((1, 832, 768), (638976, 768, 1), torch.float32)
cpp_fused_add_native_layer_norm_2(buf3, arg6_1, _frozen_param4, _frozen_param5, buf4, buf5, buf7)
del arg6_1
return (buf7, )
def benchmark_compiled_module(times=10, repeat=10):
from torch._dynamo.testing import rand_strided
from torch._inductor.utils import print_performance
global _frozen_param4
_frozen_param4 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param5
_frozen_param5 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param6
_frozen_param6 = rand_strided((3072, ), (1, ), device='cpu', dtype=torch.bfloat16)
global _frozen_param10
_frozen_param10 = rand_strided((3072, 768), (1, 0), device='cpu', dtype=torch.bfloat16)
global _frozen_param8
_frozen_param8 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.bfloat16)
global _frozen_param11
_frozen_param11 = rand_strided((768, 3072), (1, 0), device='cpu', dtype=torch.bfloat16)
arg6_1 = rand_strided((1, 832, 768), (638976, 768, 1), device='cpu', dtype=torch.float32)
fn = lambda: call([arg6_1])
return print_performance(fn, times=times, repeat=repeat)
if __name__ == "__main__":
from torch._inductor.wrapper_benchmark import compiled_module_main
compiled_module_main('hf_BigBird', benchmark_compiled_module)
V0627 17:31:10.257000 139845268738432 torch/_dynamo/] {"dynamo_guards": {}, "frame_id": 18, "frame_compile_id": 0, "attempt": 0, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
V0627 17:31:10.258000 139845268738432 torch/_dynamo/] {"dynamo_cpp_guards_str": {}, "frame_id": 18, "frame_compile_id": 0, "attempt": 0, "has_payload": "f3efa14ea8c088430fc033af17fce04d"}
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/ in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202276400)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| | | +- GuardManager: source=L['self']._modules, accessed_by=DictGetItemGuardAccessor(_modules)
| | | | +- GuardManager: source=L['self'].output, accessed_by=DictGetItemGuardAccessor(output)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].output, 139839202272320)
| | | | | +- GuardManager: source=L['self'].output.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- DICT_CONTAINS: not ___dict_contains('forward', L['self'].output.__dict__)
| | | | | | +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'], 7685824)
| | | | | | +- GuardManager: source=L['self'].output._modules, accessed_by=DictGetItemGuardAccessor(_modules)
| | | | | | | +- GuardManager: source=L['self'].output.dense, accessed_by=DictGetItemGuardAccessor(dense)
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.dense, 139839202267808)
| | | | | | | | +- GuardManager: source=L['self'].output.dense.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | | | | +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'], 7685824)
| | | | | | | +- GuardManager: source=L['self'].output.dropout, accessed_by=DictGetItemGuardAccessor(dropout)
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.dropout, 139839202268288)
| | | | | | | | +- GuardManager: source=L['self'].output.dropout.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | | | | +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'], 7685824)
| | | | | | | +- GuardManager: source=L['self'].output.LayerNorm, accessed_by=DictGetItemGuardAccessor(LayerNorm)
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.LayerNorm, 139839202268912)
| | | | | | | | +- GuardManager: source=L['self'].output.LayerNorm.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | | | | +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'], 7685824)
| | | | | | +- GuardManager: source=L['self'].output._forward_hooks, accessed_by=DictGetItemGuardAccessor(_forward_hooks)
| | | | | | +- GuardManager: source=L['self'].output._backward_hooks, accessed_by=DictGetItemGuardAccessor(_backward_hooks)
| | | | | | +- GuardManager: source=L['self'].output._forward_pre_hooks, accessed_by=DictGetItemGuardAccessor(_forward_pre_hooks)
| | | | | | +- GuardManager: source=L['self'].output._backward_pre_hooks, accessed_by=DictGetItemGuardAccessor(_backward_pre_hooks)
| | | | +- GuardManager: source=L['self'].intermediate, accessed_by=DictGetItemGuardAccessor(intermediate)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].intermediate, 139839202275440)
| | | | | +- GuardManager: source=L['self'].intermediate.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- DICT_CONTAINS: not ___dict_contains('forward', L['self'].intermediate.__dict__)
| | | | | | +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'], 7685824)
| | | | | | +- GuardManager: source=L['self'].intermediate._modules, accessed_by=DictGetItemGuardAccessor(_modules)
| | | | | | | +- GuardManager: source=L['self'].intermediate.dense, accessed_by=DictGetItemGuardAccessor(dense)
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].intermediate.dense, 139839202270544)
| | | | | | | | +- GuardManager: source=L['self'].intermediate.dense.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | | | | +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'], 7685824)
| | | | | | | +- GuardManager: source=L['self'].intermediate.intermediate_act_fn, accessed_by=DictGetItemGuardAccessor(intermediate_act_fn)
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].intermediate.intermediate_act_fn, 139839202267616)
| | | | | | | | +- GuardManager: source=L['self'].intermediate.intermediate_act_fn.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | | | | +- DICT_CONTAINS: not ___dict_contains('forward', L['self'].intermediate.intermediate_act_fn.__dict__)
| | | | | | | | | +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'], 7685824)
| | | | | | | | | +- GuardManager: source=L['self'].intermediate.intermediate_act_fn._forward_hooks, accessed_by=DictGetItemGuardAccessor(_forward_hooks)
| | | | | | | | | +- GuardManager: source=L['self'].intermediate.intermediate_act_fn._backward_hooks, accessed_by=DictGetItemGuardAccessor(_backward_hooks)
| | | | | | | | | +- GuardManager: source=L['self'].intermediate.intermediate_act_fn._forward_pre_hooks, accessed_by=DictGetItemGuardAccessor(_forward_pre_hooks)
| | | | | | | | | +- GuardManager: source=L['self'].intermediate.intermediate_act_fn._backward_pre_hooks, accessed_by=DictGetItemGuardAccessor(_backward_pre_hooks)
| | | | | | +- GuardManager: source=L['self'].intermediate._forward_hooks, accessed_by=DictGetItemGuardAccessor(_forward_hooks)
| | | | | | +- GuardManager: source=L['self'].intermediate._backward_hooks, accessed_by=DictGetItemGuardAccessor(_backward_hooks)
| | | | | | +- GuardManager: source=L['self'].intermediate._forward_pre_hooks, accessed_by=DictGetItemGuardAccessor(_forward_pre_hooks)
| | | | | | +- GuardManager: source=L['self'].intermediate._backward_pre_hooks, accessed_by=DictGetItemGuardAccessor(_backward_pre_hooks)
| | | +- GuardManager: source=L['self'].is_decoder, accessed_by=DictGetItemGuardAccessor(is_decoder)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].is_decoder, 7685824)
| | | +- GuardManager: source=L['self'].seq_len_dim, accessed_by=DictGetItemGuardAccessor(seq_len_dim)
| | | | +- EQUALS_MATCH: L['self'].seq_len_dim == 1
| | | +- GuardManager: source=L['self'].chunk_size_feed_forward, accessed_by=DictGetItemGuardAccessor(chunk_size_feed_forward)
| | | | +- EQUALS_MATCH: L['self'].chunk_size_feed_forward == 0
| +- GuardManager: source=L['___stack0'], accessed_by=DictGetItemGuardAccessor(___stack0)
| | +- TYPE_MATCH: ___check_type_id(L['___stack0'], 7625984)
| | +- LENGTH_CHECK: len(L['___stack0']) == 1
| | +- GuardManager: source=L['___stack0'][0], accessed_by=TupleGetItemGuardAccessor(0)
| | | +- TENSOR_MATCH: check_tensor(L['___stack0'][0], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 832, 768], stride=[638976, 768, 1])
| | | +- NO_HASATTR: hasattr(L['___stack0'][0], '_dynamo_dynamic_indices') == False
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['apply_chunking_to_forward'], accessed_by=DictGetItemGuardAccessor(apply_chunking_to_forward)
| | | +- GuardManager: source=G['apply_chunking_to_forward'].__code__, accessed_by=GetAttrGuardAccessor(__code__)
| | | | +- ID_MATCH: ___check_obj_id(G['apply_chunking_to_forward'].__code__, 139839646455872)
| | +- GuardManager: source=G['__builtins_dict___52'], accessed_by=DictGetItemGuardAccessor(__builtins_dict___52)
| | | +- GuardManager: source=G['__builtins_dict___52']['len'], accessed_by=DictGetItemGuardAccessor(len)
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___52']['len'], 139845257826832)
| | +- GuardManager: source=G['__import_transformers_dot_activations'], accessed_by=DictGetItemGuardAccessor(__import_transformers_dot_activations)
| | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_activations'], 139839665031744)
| | | +- GuardManager: source=G['__import_transformers_dot_activations'].math, accessed_by=GetAttrGuardAccessor(math)
| | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_activations'].math, 139845236089744)
| | | | +- GuardManager: source=G['__import_transformers_dot_activations'].math.pi, accessed_by=GetAttrGuardAccessor(pi)
| | | | | +- EQUALS_MATCH: G['__import_transformers_dot_activations'].math.pi == 3.141592653589793
| | | | +- GuardManager: source=G['__import_transformers_dot_activations'].math.sqrt, accessed_by=GetAttrGuardAccessor(sqrt)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_activations'].math.sqrt, 139845236093344)
| | | +- GuardManager: source=G['__import_transformers_dot_activations'].torch, accessed_by=GetAttrGuardAccessor(torch)
| | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_activations'].torch, 139845236322800)
| | | | +- GuardManager: source=G['__import_transformers_dot_activations'].torch.pow, accessed_by=GetAttrGuardAccessor(pow)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_activations'].torch.pow, 139845228824512)
| | | | +- GuardManager: source=G['__import_transformers_dot_activations'].torch.tanh, accessed_by=GetAttrGuardAccessor(tanh)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_activations'].torch.tanh, 139845228799744)
| | +- GuardManager: source=G['__import_transformers_dot_pytorch_utils'], accessed_by=DictGetItemGuardAccessor(__import_transformers_dot_pytorch_utils)
| | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_pytorch_utils'], 139839703287984)
| | | +- GuardManager: source=G['__import_transformers_dot_pytorch_utils'].inspect, accessed_by=GetAttrGuardAccessor(inspect)
| | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_pytorch_utils'].inspect, 139845236517488)
| | | | +- GuardManager: source=G['__import_transformers_dot_pytorch_utils'].inspect.signature, accessed_by=GetAttrGuardAccessor(signature)
| | | | | +- GuardManager: source=G['__import_transformers_dot_pytorch_utils'].inspect.signature.__code__, accessed_by=GetAttrGuardAccessor(__code__)
| | | | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_pytorch_utils'].inspect.signature.__code__, 139845231798640)
| | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'], accessed_by=DictGetItemGuardAccessor(__import_torch_dot_nn_dot_modules_dot_module)
| | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'], 139842442598640)
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'].torch, accessed_by=GetAttrGuardAccessor(torch)
| | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'].torch, 139845236322800)
| | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C, accessed_by=GetAttrGuardAccessor(_C)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C, 139845228547104)
| | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C._get_tracing_state, accessed_by=GetAttrGuardAccessor(_get_tracing_state)
| | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C._get_tracing_state, 139842451895088)
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_hooks, accessed_by=GetAttrGuardAccessor(_global_forward_hooks)
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_hooks, 7497696)
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_hooks
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_hooks, accessed_by=GetAttrGuardAccessor(_global_backward_hooks)
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_hooks, 7497696)
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_hooks
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_pre_hooks, accessed_by=GetAttrGuardAccessor(_global_forward_pre_hooks)
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_pre_hooks, 7497696)
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_pre_hooks
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_pre_hooks, accessed_by=GetAttrGuardAccessor(_global_backward_pre_hooks)
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_pre_hooks, 7497696)
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_pre_hooks
V0627 17:31:10.259000 139845268738432 torch/_dynamo/] {"compilation_metrics": {"compile_id": "18/0", "frame_key": "23", "co_name": "torch_dynamo_resume_in_forward_at_1488", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/", "co_firstlineno": 1488, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 69, "shape_env_guard_count": 0, "graph_op_count": 13, "graph_node_count": 15, "graph_input_count": 1, "start_time": 1719534669.9335542, "entire_frame_compile_time_s": 0.3254525661468506, "backend_compile_time_s": 0.26067519187927246, "inductor_compile_time_s": 0.1273505687713623, "code_gen_time_s": 0.07860469818115234, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": [], "dynamo_time_before_restart_s": 0.0, "has_guarded_code": true}, "frame_id": 18, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:10.262000 139845268738432 torch/_dynamo/] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.270000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 0, "describer_id": 46, "size": 442368}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.270000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 0, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 46}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.270000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 46, "id": 0, "source": "L['band_mask']"}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.271000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 1, "describer_id": 46, "size": 2555904}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.271000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 1, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e301592b0>", "describer_id": 46}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.271000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 46, "id": 1, "source": "L['hidden_states']"}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.273000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 2, "describer_id": 46, "size": 3328}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.273000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 46}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.273000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 2, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 46}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.273000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 46, "id": 2, "source": "L['from_mask']"}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.275000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 4, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 46}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.275000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 46, "id": 4, "source": "L['to_mask']"}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.306000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 0, "describer_id": 47, "size": 2555904}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.307000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 0, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e301592b0>", "describer_id": 47}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.307000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 47, "id": 0, "source": "L['hidden_states']"}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.308000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 1, "describer_id": 47, "size": 442368}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.308000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 1, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 47}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.308000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 47, "id": 1, "source": "L['band_mask']"}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.309000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 2, "describer_id": 47, "size": 3328}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.309000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 47}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.309000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 2, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 47}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.309000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 47, "id": 2, "source": "L['from_mask']"}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.310000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 4, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 47}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.311000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 47, "id": 4, "source": "L['to_mask']"}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.312000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 5, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 13, 64], "is_leaf": true, "is_view": true, "stride": [832, 64, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317fbec0>", "describer_id": 47}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.312000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 47, "id": 5, "source": "L['blocked_encoder_mask']"}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.317000 139845268738432 torch/_dynamo/] {"dynamo_guards": {}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
V0627 17:31:10.317000 139845268738432 torch/_dynamo/] {"dynamo_cpp_guards_str": {}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1, "has_payload": "9d228664307649151c1145ad228290a7"}
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/ in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202274768)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| | | +- GuardManager: source=L['self']._modules, accessed_by=DictGetItemGuardAccessor(_modules)
| | | | +- GuardManager: source=L['self'].attention, accessed_by=DictGetItemGuardAccessor(attention)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].attention, 139839202265168)
| | | | | +- GuardManager: source=L['self'].attention.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'], 7685824)
| +- GuardManager: source=L['to_mask'], accessed_by=DictGetItemGuardAccessor(to_mask)
| | +- TENSOR_MATCH: check_tensor(L['to_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 1, 832], stride=[832, 832, 832, 1])
| | +- NO_HASATTR: hasattr(L['to_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['blocked_encoder_mask'])
| +- GuardManager: source=L['band_mask'], accessed_by=DictGetItemGuardAccessor(band_mask)
| | +- TENSOR_MATCH: check_tensor(L['band_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 9, 64, 192], stride=[110592, 110592, 12288, 192, 1])
| | +- NO_HASATTR: hasattr(L['band_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['blocked_encoder_mask'])
| +- GuardManager: source=L['from_mask'], accessed_by=DictGetItemGuardAccessor(from_mask)
| | +- TENSOR_MATCH: check_tensor(L['from_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 832, 1], stride=[832, 832, 1, 1])
| | +- NO_HASATTR: hasattr(L['from_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['blocked_encoder_mask'])
| +- GuardManager: source=L['head_mask'], accessed_by=DictGetItemGuardAccessor(head_mask)
| | +- ID_MATCH: ___check_obj_id(L['head_mask'], 7636800)
| +- GuardManager: source=L['hidden_states'], accessed_by=DictGetItemGuardAccessor(hidden_states)
| | +- TENSOR_MATCH: check_tensor(L['hidden_states'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 832, 768], stride=[638976, 768, 1])
| | +- NO_HASATTR: hasattr(L['hidden_states'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['blocked_encoder_mask'])
| +- GuardManager: source=L['attention_mask'], accessed_by=DictGetItemGuardAccessor(attention_mask)
| | +- ID_MATCH: ___check_obj_id(L['attention_mask'], 7636800)
| +- GuardManager: source=L['past_key_value'], accessed_by=DictGetItemGuardAccessor(past_key_value)
| | +- ID_MATCH: ___check_obj_id(L['past_key_value'], 7636800)
| +- GuardManager: source=L['output_attentions'], accessed_by=DictGetItemGuardAccessor(output_attentions)
| | +- ID_MATCH: ___check_obj_id(L['output_attentions'], 7685824)
| +- GuardManager: source=L['blocked_encoder_mask'], accessed_by=DictGetItemGuardAccessor(blocked_encoder_mask)
| | +- TENSOR_MATCH: check_tensor(L['blocked_encoder_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 13, 64], stride=[832, 64, 1])
| | +- NO_HASATTR: hasattr(L['blocked_encoder_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['blocked_encoder_mask'])
| +- GuardManager: source=L['encoder_hidden_states'], accessed_by=DictGetItemGuardAccessor(encoder_hidden_states)
| | +- ID_MATCH: ___check_obj_id(L['encoder_hidden_states'], 7636800)
| +- GuardManager: source=L['encoder_attention_mask'], accessed_by=DictGetItemGuardAccessor(encoder_attention_mask)
| | +- ID_MATCH: ___check_obj_id(L['encoder_attention_mask'], 7636800)
V0627 17:31:10.318000 139845268738432 torch/_dynamo/] {"compilation_metrics": {"compile_id": "7/1", "frame_key": "24", "co_name": "forward", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/", "co_firstlineno": 1472, "cache_size": 0, "accumulated_cache_size": 1, "guard_count": 18, "shape_env_guard_count": 0, "graph_op_count": 0, "graph_node_count": 5, "graph_input_count": 5, "start_time": 1719534670.2629929, "entire_frame_compile_time_s": 0.05506253242492676, "backend_compile_time_s": null, "inductor_compile_time_s": null, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Graph break due to unsupported builtin numpy.random.mtrand.seed. This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind). If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround. If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see for more details) or, if it is traceable, use torch.compiler.allow_in_graph."], "dynamo_time_before_restart_s": 0.04132270812988281, "has_guarded_code": true}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.318000 139845268738432 torch/_dynamo/] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.320000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 0, "describer_id": 48, "size": 442368}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.320000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 0, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 48}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.320000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 48, "id": 0, "source": "L['band_mask']"}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.321000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 1, "describer_id": 48, "size": 2555904}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.321000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 1, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e301592b0>", "describer_id": 48}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.321000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 48, "id": 1, "source": "L['hidden_states']"}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.323000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 2, "describer_id": 48, "size": 3328}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.323000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 48}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.323000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 2, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 48}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.323000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 48, "id": 2, "source": "L['from_mask']"}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.325000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 4, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 48}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.325000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 48, "id": 4, "source": "L['to_mask']"}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.355000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 0, "describer_id": 49, "size": 442368}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.355000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 0, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 49}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.355000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 49, "id": 0, "source": "L['band_mask']"}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.356000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 1, "describer_id": 49, "size": 2555904}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.356000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 1, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e301592b0>", "describer_id": 49}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.356000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 49, "id": 1, "source": "L['hidden_states']"}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.357000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 2, "describer_id": 49, "size": 3328}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.357000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 49}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.358000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 2, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 49}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.358000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 49, "id": 2, "source": "L['from_mask']"}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.360000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 4, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 49}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.360000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 49, "id": 4, "source": "L['to_mask']"}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.362000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 5, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 13, 64], "is_leaf": true, "is_view": true, "stride": [832, 64, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317fbec0>", "describer_id": 49}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.362000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 49, "id": 5, "source": "L['from_blocked_mask']"}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.365000 139845268738432 torch/_dynamo/] {"dynamo_output_graph": {"sizes": {"l_band_mask_": [1, 1, 9, 64, 192], "l_from_mask_": [1, 1, 832, 1], "l_to_mask_": [1, 1, 1, 832], "band_mask": [1, 1, 9, 64, 192], "from_mask": [1, 1, 832, 1], "to_mask": [1, 1, 1, 832]}}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1, "has_payload": "b5eca5f100188f494b5033015854eb4e"}
class GraphModule(torch.nn.Module):
def forward(self, L_band_mask_: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu", L_from_mask_: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu", L_to_mask_: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu"):
l_band_mask_ = L_band_mask_
l_from_mask_ = L_from_mask_
l_to_mask_ = L_to_mask_
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: band_mask =
band_mask: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" =; l_band_mask_ = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: from_mask =
from_mask: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu" =; l_from_mask_ = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: to_mask =
to_mask: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" =; l_to_mask_ = None
return (band_mask, from_mask, to_mask)
V0627 17:31:10.380000 139845268738432 torch/_functorch/_aot_autograd/] {"aot_forward_graph": {}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1, "has_payload": "3ad949bb5c0c76a73cad2e99f5c9ebe2"}
class <lambda>(torch.nn.Module):
def forward(self, arg0_1: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu", arg1_1: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu", arg2_1: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu"):
return (arg0_1, arg1_1, arg2_1)
V0627 17:31:10.392000 139845268738432 torch/_dynamo/] {"dynamo_guards": {}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
V0627 17:31:10.392000 139845268738432 torch/_dynamo/] {"dynamo_cpp_guards_str": {}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1, "has_payload": "78c6200e495d09cd995b82c1e530d62e"}
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/ in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202265168)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| | | +- GuardManager: source=L['self']._modules, accessed_by=DictGetItemGuardAccessor(_modules)
| | | | +- GuardManager: source=L['self'].self, accessed_by=DictGetItemGuardAccessor(self)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].self, 139839202264976)
| | | | | +- GuardManager: source=L['self'].self.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'], 7685824)
| | | +- GuardManager: source=L['self'].attention_type, accessed_by=DictGetItemGuardAccessor(attention_type)
| | | | +- EQUALS_MATCH: L['self'].attention_type == 'block_sparse'
| +- GuardManager: source=L['to_mask'], accessed_by=DictGetItemGuardAccessor(to_mask)
| | +- TENSOR_MATCH: check_tensor(L['to_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 1, 832], stride=[832, 832, 832, 1])
| | +- NO_HASATTR: hasattr(L['to_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['band_mask'], accessed_by=DictGetItemGuardAccessor(band_mask)
| | +- TENSOR_MATCH: check_tensor(L['band_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 9, 64, 192], stride=[110592, 110592, 12288, 192, 1])
| | +- NO_HASATTR: hasattr(L['band_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['from_mask'], accessed_by=DictGetItemGuardAccessor(from_mask)
| | +- TENSOR_MATCH: check_tensor(L['from_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 832, 1], stride=[832, 832, 1, 1])
| | +- NO_HASATTR: hasattr(L['from_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['hidden_states'], accessed_by=DictGetItemGuardAccessor(hidden_states)
| | +- TENSOR_MATCH: check_tensor(L['hidden_states'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 832, 768], stride=[638976, 768, 1])
| | +- NO_HASATTR: hasattr(L['hidden_states'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['from_blocked_mask'], accessed_by=DictGetItemGuardAccessor(from_blocked_mask)
| | +- TENSOR_MATCH: check_tensor(L['from_blocked_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 13, 64], stride=[832, 64, 1])
| | +- NO_HASATTR: hasattr(L['from_blocked_mask'], '_dynamo_dynamic_indices') == False
| | +- TENSOR_ALIASING: L['from_blocked_mask'] is L['to_blocked_mask']
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['output_attentions'], accessed_by=DictGetItemGuardAccessor(output_attentions)
| | +- ID_MATCH: ___check_obj_id(L['output_attentions'], 7685824)
| +- GuardManager: source=L['encoder_hidden_states'], accessed_by=DictGetItemGuardAccessor(encoder_hidden_states)
| | +- ID_MATCH: ___check_obj_id(L['encoder_hidden_states'], 7636800)
| +- GuardManager: source=L['to_blocked_mask'], accessed_by=DictGetItemGuardAccessor(to_blocked_mask)
| | +- TENSOR_ALIASING: L['from_blocked_mask'] is L['to_blocked_mask']
V0627 17:31:10.392000 139845268738432 torch/_dynamo/] {"compilation_metrics": {"compile_id": "8/1", "frame_key": "25", "co_name": "forward", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/", "co_firstlineno": 1365, "cache_size": 0, "accumulated_cache_size": 1, "guard_count": 16, "shape_env_guard_count": 0, "graph_op_count": 3, "graph_node_count": 7, "graph_input_count": 3, "start_time": 1719534670.3189635, "entire_frame_compile_time_s": 0.07366013526916504, "backend_compile_time_s": 0.02211451530456543, "inductor_compile_time_s": 0.00025773048400878906, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Graph break due to unsupported builtin numpy.random.mtrand.seed. This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind). If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround. If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see for more details) or, if it is traceable, use torch.compiler.allow_in_graph."], "dynamo_time_before_restart_s": 0.03475379943847656, "has_guarded_code": true}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.393000 139845268738432 torch/_dynamo/] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 9, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.395000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 0, "describer_id": 51, "size": 2555904}, "frame_id": 9, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.395000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 0, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e301592b0>", "describer_id": 51}, "frame_id": 9, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.395000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 51, "id": 0, "source": "L['hidden_states']"}, "frame_id": 9, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.419000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 0, "describer_id": 52, "size": 2555904}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.419000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 0, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e301592b0>", "describer_id": 52}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.419000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 52, "id": 0, "source": "L['hidden_states']"}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.434000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 7, "describer_id": 52, "size": 442368}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.434000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 7, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 7, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 52}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.434000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 52, "id": 7, "source": "L['band_mask']"}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.435000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 8, "describer_id": 52, "size": 3328}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.435000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 9, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 8, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 52}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.435000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 8, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 8, "base": 9, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 52}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.435000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 52, "id": 8, "source": "L['from_mask']"}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.437000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 10, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 8, "base": 9, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 52}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.437000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 52, "id": 10, "source": "L['to_mask']"}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.438000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 11, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 13, 64], "is_leaf": true, "is_view": true, "stride": [832, 64, 1], "storage": 8, "base": 9, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317fbec0>", "describer_id": 52}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.438000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 52, "id": 11, "source": "L['from_blocked_mask']"}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.441000 139845268738432 torch/_dynamo/] {"dynamo_output_graph": {"sizes": {"l_hidden_states_": [1, 832, 768], "l__self___query": [1, 832, 768], "x": [1, 832, 12, 64], "query_layer": [1, 12, 832, 64], "l__self___key": [1, 832, 768], "x_1": [1, 832, 12, 64], "key_layer": [1, 12, 832, 64], "l__self___value": [1, 832, 768], "x_2": [1, 832, 12, 64], "value_layer": [1, 12, 832, 64]}}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1, "has_payload": "0aeb326082c3dce6efb144a844a46bdf"}
class GraphModule(torch.nn.Module):
def forward(self, L_hidden_states_: "f32[1, 832, 768][638976, 768, 1]cpu"):
l_hidden_states_ = L_hidden_states_
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: query_layer = self.transpose_for_scores(self.query(hidden_states))
l__self___query: "bf16[1, 832, 768][638976, 768, 1]cpu" = self.L__self___query(l_hidden_states_)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in transpose_for_scores, code: x = x.view(*new_x_shape)
x: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = l__self___query.view(1, 832, 12, 64); l__self___query = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
query_layer: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = x.permute(0, 2, 1, 3); x = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: key_layer = self.transpose_for_scores(self.key(hidden_states))
l__self___key: "bf16[1, 832, 768][638976, 768, 1]cpu" = self.L__self___key(l_hidden_states_)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in transpose_for_scores, code: x = x.view(*new_x_shape)
x_1: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = l__self___key.view(1, 832, 12, 64); l__self___key = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
key_layer: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = x_1.permute(0, 2, 1, 3); x_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: value_layer = self.transpose_for_scores(self.value(hidden_states))
l__self___value: "bf16[1, 832, 768][638976, 768, 1]cpu" = self.L__self___value(l_hidden_states_); l_hidden_states_ = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in transpose_for_scores, code: x = x.view(*new_x_shape)
x_2: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = l__self___value.view(1, 832, 12, 64); l__self___value = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
value_layer: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = x_2.permute(0, 2, 1, 3); x_2 = None
return (query_layer, key_layer, value_layer)
V0627 17:31:10.491000 139845268738432 torch/_functorch/_aot_autograd/] {"aot_forward_graph": {}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1, "has_payload": "d91c82abb65a6c80f96ee652ae86f63d"}
class <lambda>(torch.nn.Module):
def forward(self, arg0_1: "f32[768, 768][768, 1]cpu", arg1_1: "f32[768][1]cpu", arg2_1: "f32[768, 768][768, 1]cpu", arg3_1: "f32[768][1]cpu", arg4_1: "f32[768, 768][768, 1]cpu", arg5_1: "f32[768][1]cpu", arg6_1: "f32[1, 832, 768][638976, 768, 1]cpu"):
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: query_layer = self.transpose_for_scores(self.query(hidden_states))
convert_element_type: "bf16[768][1]cpu" = torch.ops.prims.convert_element_type.default(arg1_1, torch.bfloat16); arg1_1 = None
convert_element_type_1: "bf16[768, 768][768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg0_1, torch.bfloat16); arg0_1 = None
convert_element_type_2: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg6_1, torch.bfloat16)
view: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.view.default(convert_element_type_2, [832, 768]); convert_element_type_2 = None
permute: "bf16[768, 768][1, 768]cpu" = torch.ops.aten.permute.default(convert_element_type_1, [1, 0]); convert_element_type_1 = None
addmm: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.addmm.default(convert_element_type, view, permute); convert_element_type = view = permute = None
view_1: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.view.default(addmm, [1, 832, 768]); addmm = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in transpose_for_scores, code: x = x.view(*new_x_shape)
view_2: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.view.default(view_1, [1, 832, 12, 64]); view_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
permute_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_2, [0, 2, 1, 3]); view_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: key_layer = self.transpose_for_scores(self.key(hidden_states))
convert_element_type_6: "bf16[768][1]cpu" = torch.ops.prims.convert_element_type.default(arg3_1, torch.bfloat16); arg3_1 = None
convert_element_type_7: "bf16[768, 768][768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg2_1, torch.bfloat16); arg2_1 = None
convert_element_type_8: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg6_1, torch.bfloat16)
view_3: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.view.default(convert_element_type_8, [832, 768]); convert_element_type_8 = None
permute_2: "bf16[768, 768][1, 768]cpu" = torch.ops.aten.permute.default(convert_element_type_7, [1, 0]); convert_element_type_7 = None
addmm_1: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.addmm.default(convert_element_type_6, view_3, permute_2); convert_element_type_6 = view_3 = permute_2 = None
view_4: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.view.default(addmm_1, [1, 832, 768]); addmm_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in transpose_for_scores, code: x = x.view(*new_x_shape)
view_5: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.view.default(view_4, [1, 832, 12, 64]); view_4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
permute_3: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_5, [0, 2, 1, 3]); view_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: value_layer = self.transpose_for_scores(self.value(hidden_states))
convert_element_type_12: "bf16[768][1]cpu" = torch.ops.prims.convert_element_type.default(arg5_1, torch.bfloat16); arg5_1 = None
convert_element_type_13: "bf16[768, 768][768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg4_1, torch.bfloat16); arg4_1 = None
convert_element_type_14: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg6_1, torch.bfloat16); arg6_1 = None
view_6: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.view.default(convert_element_type_14, [832, 768]); convert_element_type_14 = None
permute_4: "bf16[768, 768][1, 768]cpu" = torch.ops.aten.permute.default(convert_element_type_13, [1, 0]); convert_element_type_13 = None
addmm_2: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.addmm.default(convert_element_type_12, view_6, permute_4); convert_element_type_12 = view_6 = permute_4 = None
view_7: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.view.default(addmm_2, [1, 832, 768]); addmm_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in transpose_for_scores, code: x = x.view(*new_x_shape)
view_8: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.view.default(view_7, [1, 832, 12, 64]); view_7 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
permute_5: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_8, [0, 2, 1, 3]); view_8 = None
return (permute_1, permute_3, permute_5)
V0627 17:31:10.557000 139845268738432 torch/_inductor/] {"inductor_post_grad_graph": {}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1, "has_payload": "1df7e432445d21ccb26ae7f35b4ee86e"}
class <lambda>(torch.nn.Module):
def forward(self, arg6_1: "f32[1, 832, 768][638976, 768, 1]cpu"):
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: query_layer = self.transpose_for_scores(self.query(hidden_states))
_frozen_param6: "bf16[768][1]cpu" = self._frozen_param6
# No stacktrace found for following nodes
_frozen_param12: "bf16[768, 768][1, 0]cpu" = self._frozen_param12
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: key_layer = self.transpose_for_scores(self.key(hidden_states))
_frozen_param8: "bf16[768][1]cpu" = self._frozen_param8
# No stacktrace found for following nodes
_frozen_param13: "bf16[768, 768][1, 0]cpu" = self._frozen_param13
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: value_layer = self.transpose_for_scores(self.value(hidden_states))
_frozen_param10: "bf16[768][1]cpu" = self._frozen_param10
# No stacktrace found for following nodes
_frozen_param14: "bf16[768, 768][1, 0]cpu" = self._frozen_param14
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: query_layer = self.transpose_for_scores(self.query(hidden_states))
convert_element_type_2: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg6_1, torch.bfloat16); arg6_1 = None
_linear_pointwise_default_5: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.mkldnn._linear_pointwise.default(convert_element_type_2, _frozen_param12, _frozen_param6, 'none', [], ''); _frozen_param12 = _frozen_param6 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in transpose_for_scores, code: x = x.view(*new_x_shape)
view_2: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.reshape.default(_linear_pointwise_default_5, [1, 832, 12, 64]); _linear_pointwise_default_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
permute_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_2, [0, 2, 1, 3]); view_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: key_layer = self.transpose_for_scores(self.key(hidden_states))
_linear_pointwise_default_4: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.mkldnn._linear_pointwise.default(convert_element_type_2, _frozen_param13, _frozen_param8, 'none', [], ''); _frozen_param13 = _frozen_param8 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in transpose_for_scores, code: x = x.view(*new_x_shape)
view_5: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.reshape.default(_linear_pointwise_default_4, [1, 832, 12, 64]); _linear_pointwise_default_4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
permute_3: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_5, [0, 2, 1, 3]); view_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in forward, code: value_layer = self.transpose_for_scores(self.value(hidden_states))
_linear_pointwise_default_3: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.mkldnn._linear_pointwise.default(convert_element_type_2, _frozen_param14, _frozen_param10, 'none', [], ''); convert_element_type_2 = _frozen_param14 = _frozen_param10 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in transpose_for_scores, code: x = x.view(*new_x_shape)
view_8: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.reshape.default(_linear_pointwise_default_3, [1, 832, 12, 64]); _linear_pointwise_default_3 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
permute_5: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_8, [0, 2, 1, 3]); view_8 = None
return (permute_1, permute_3, permute_5)
V0627 17:31:10.578000 139845268738432 torch/_inductor/] {"inductor_output_code": {"filename": "/tmp/torchinductor_leslie/wm/"}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1, "has_payload": "c068758cb8977ae26fcf611c09070a9a"}
# AOT ID: ['13_inference']
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
from torch._inductor.codegen.memory_planning import _align as align
from torch import device, empty_strided
from torch._inductor.async_compile import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
from torch._inductor.codegen.multi_kernel import MultiKernelCall
aten = torch.ops.aten
inductor_ops = torch.ops.inductor
_quantized = torch.ops._quantized
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
alloc_from_pool = torch.ops.inductor._alloc_from_pool
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
async_compile = AsyncCompile()
_frozen_param6 = None # device(type='cpu') torch.bfloat16 (768,) (1,) 7f2e300d5490
_frozen_param12 = None # device(type='cpu') torch.bfloat16 (768, 768) (1, 0) 7f2e300daed0
_frozen_param8 = None # device(type='cpu') torch.bfloat16 (768,) (1,) 7f2e300a0c70
_frozen_param13 = None # device(type='cpu') torch.bfloat16 (768, 768) (1, 0) 7f2e300a3e70
_frozen_param10 = None # device(type='cpu') torch.bfloat16 (768,) (1,) 7f2e300a1490
_frozen_param14 = None # device(type='cpu') torch.bfloat16 (768, 768) (1, 0) 7f2e300dbfb0
cpp_fused__to_copy_0 = async_compile.cpp_pybinding(['const float*', 'bfloat16*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const float* in_ptr0,
bfloat16* out_ptr0)
#pragma omp parallel num_threads(56)
int tid = omp_get_thread_num();
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(638976L); x0+=static_cast<long>(16L))
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<long>(x0), 16);
auto tmp1 = at::vec::convert<bfloat16>(tmp0); + static_cast<long>(x0), 16);
del async_compile
def call(args):
arg6_1, = args
assert_size_stride(arg6_1, (1, 832, 768), (638976, 768, 1))
buf0 = empty_strided_cpu((1, 832, 768), (638976, 768, 1), torch.bfloat16)
cpp_fused__to_copy_0(arg6_1, buf0)
del arg6_1
buf1 = torch.ops.mkldnn._linear_pointwise(buf0, _frozen_param12, _frozen_param6, 'none', [-1], '')
buf2 = torch.ops.mkldnn._linear_pointwise(buf0, _frozen_param13, _frozen_param8, 'none', [-1], '')
buf3 = torch.ops.mkldnn._linear_pointwise(buf0, _frozen_param14, _frozen_param10, 'none', [-1], '')
return (reinterpret_tensor(buf1, (1, 12, 832, 64), (638976, 64, 768, 1), 0), reinterpret_tensor(buf2, (1, 12, 832, 64), (638976, 64, 768, 1), 0), reinterpret_tensor(buf3, (1, 12, 832, 64), (638976, 64, 768, 1), 0), )
def benchmark_compiled_module(times=10, repeat=10):
from torch._dynamo.testing import rand_strided
from torch._inductor.utils import print_performance
global _frozen_param6
_frozen_param6 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.bfloat16)
global _frozen_param12
_frozen_param12 = rand_strided((768, 768), (1, 0), device='cpu', dtype=torch.bfloat16)
global _frozen_param8
_frozen_param8 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.bfloat16)
global _frozen_param13
_frozen_param13 = rand_strided((768, 768), (1, 0), device='cpu', dtype=torch.bfloat16)
global _frozen_param10
_frozen_param10 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.bfloat16)
global _frozen_param14
_frozen_param14 = rand_strided((768, 768), (1, 0), device='cpu', dtype=torch.bfloat16)
arg6_1 = rand_strided((1, 832, 768), (638976, 768, 1), device='cpu', dtype=torch.float32)
fn = lambda: call([arg6_1])
return print_performance(fn, times=times, repeat=repeat)
if __name__ == "__main__":
from torch._inductor.wrapper_benchmark import compiled_module_main
compiled_module_main('hf_BigBird', benchmark_compiled_module)
V0627 17:31:10.587000 139845268738432 torch/_dynamo/] {"dynamo_guards": {}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
V0627 17:31:10.588000 139845268738432 torch/_dynamo/] {"dynamo_cpp_guards_str": {}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1, "has_payload": "da04fa8fdd18f2f15ae08b9dbbb492e0"}
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/ in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202264976)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| | | +- GuardManager: source=L['self']._modules, accessed_by=DictGetItemGuardAccessor(_modules)
| | | | +- GuardManager: source=L['self'].key, accessed_by=DictGetItemGuardAccessor(key)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].key, 139839202265648)
| | | | | +- GuardManager: source=L['self'].key.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'], 7685824)
| | | | +- GuardManager: source=L['self'].query, accessed_by=DictGetItemGuardAccessor(query)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].query, 139839202265696)
| | | | | +- GuardManager: source=L['self'].query.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'], 7685824)
| | | | +- GuardManager: source=L['self'].value, accessed_by=DictGetItemGuardAccessor(value)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].value, 139839202264592)
| | | | | +- GuardManager: source=L['self'].value.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'], 7685824)
| | | +- GuardManager: source=L['self'].seed, accessed_by=DictGetItemGuardAccessor(seed)
| | | | +- EQUALS_MATCH: L['self'].seed == 1
| | | +- GuardManager: source=L['self'].block_size, accessed_by=DictGetItemGuardAccessor(block_size)
| | | | +- EQUALS_MATCH: L['self'].block_size == 64
| | | +- GuardManager: source=L['self'].num_random_blocks, accessed_by=DictGetItemGuardAccessor(num_random_blocks)
| | | | +- EQUALS_MATCH: L['self'].num_random_blocks == 3
| | | +- GuardManager: source=L['self'].attention_head_size, accessed_by=DictGetItemGuardAccessor(attention_head_size)
| | | | +- EQUALS_MATCH: L['self'].attention_head_size == 64
| | | +- GuardManager: source=L['self'].num_attention_heads, accessed_by=DictGetItemGuardAccessor(num_attention_heads)
| | | | +- EQUALS_MATCH: L['self'].num_attention_heads == 12
| +- GuardManager: source=L['to_mask'], accessed_by=DictGetItemGuardAccessor(to_mask)
| | +- TENSOR_MATCH: check_tensor(L['to_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 1, 832], stride=[832, 832, 832, 1])
| | +- NO_HASATTR: hasattr(L['to_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['band_mask'], accessed_by=DictGetItemGuardAccessor(band_mask)
| | +- TENSOR_MATCH: check_tensor(L['band_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 9, 64, 192], stride=[110592, 110592, 12288, 192, 1])
| | +- NO_HASATTR: hasattr(L['band_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['from_mask'], accessed_by=DictGetItemGuardAccessor(from_mask)
| | +- TENSOR_MATCH: check_tensor(L['from_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 832, 1], stride=[832, 832, 1, 1])
| | +- NO_HASATTR: hasattr(L['from_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['hidden_states'], accessed_by=DictGetItemGuardAccessor(hidden_states)
| | +- TENSOR_MATCH: check_tensor(L['hidden_states'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 832, 768], stride=[638976, 768, 1])
| | +- NO_HASATTR: hasattr(L['hidden_states'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['from_blocked_mask'], accessed_by=DictGetItemGuardAccessor(from_blocked_mask)
| | +- TENSOR_MATCH: check_tensor(L['from_blocked_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 13, 64], stride=[832, 64, 1])
| | +- NO_HASATTR: hasattr(L['from_blocked_mask'], '_dynamo_dynamic_indices') == False
| | +- TENSOR_ALIASING: L['from_blocked_mask'] is L['to_blocked_mask']
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['output_attentions'], accessed_by=DictGetItemGuardAccessor(output_attentions)
| | +- ID_MATCH: ___check_obj_id(L['output_attentions'], 7685824)
| +- GuardManager: source=L['to_blocked_mask'], accessed_by=DictGetItemGuardAccessor(to_blocked_mask)
| | +- TENSOR_ALIASING: L['from_blocked_mask'] is L['to_blocked_mask']
V0627 17:31:10.588000 139845268738432 torch/_dynamo/] {"compilation_metrics": {"compile_id": "9/1", "frame_key": "26", "co_name": "forward", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/", "co_firstlineno": 446, "cache_size": 0, "accumulated_cache_size": 1, "guard_count": 21, "shape_env_guard_count": 0, "graph_op_count": 9, "graph_node_count": 11, "graph_input_count": 1, "start_time": 1719534670.3936255, "entire_frame_compile_time_s": 0.19471240043640137, "backend_compile_time_s": 0.1402432918548584, "inductor_compile_time_s": 0.033010005950927734, "code_gen_time_s": 0.012862920761108398, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Graph break due to unsupported builtin numpy.random.mtrand.seed. This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind). If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround. If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see for more details) or, if it is traceable, use torch.compiler.allow_in_graph."], "dynamo_time_before_restart_s": 0.024178743362426758, "has_guarded_code": true}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.589000 139845268738432 torch/_dynamo/] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 472, "name": "forward", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 10, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.614000 139845268738432 torch/_dynamo/] {"dynamo_guards": {}, "frame_id": 10, "frame_compile_id": 1, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
V0627 17:31:10.614000 139845268738432 torch/_dynamo/] {"dynamo_cpp_guards_str": {}, "frame_id": 10, "frame_compile_id": 1, "attempt": 1, "has_payload": "1ea07e64f0c0d490d94336fa323c05e9"}
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/ in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['seed'], accessed_by=DictGetItemGuardAccessor(seed)
| | +- EQUALS_MATCH: L['seed'] == 1
| +- GuardManager: source=L['batch_size'], accessed_by=DictGetItemGuardAccessor(batch_size)
| | +- EQUALS_MATCH: L['batch_size'] == 1
| +- GuardManager: source=L['to_seq_len'], accessed_by=DictGetItemGuardAccessor(to_seq_len)
| | +- EQUALS_MATCH: L['to_seq_len'] == 832
| +- GuardManager: source=L['from_seq_len'], accessed_by=DictGetItemGuardAccessor(from_seq_len)
| | +- EQUALS_MATCH: L['from_seq_len'] == 832
| +- GuardManager: source=L['to_block_size'], accessed_by=DictGetItemGuardAccessor(to_block_size)
| | +- EQUALS_MATCH: L['to_block_size'] == 64
| +- GuardManager: source=L['from_block_size'], accessed_by=DictGetItemGuardAccessor(from_block_size)
| | +- EQUALS_MATCH: L['from_block_size'] == 64
| +- GuardManager: source=L['attention_head_size'], accessed_by=DictGetItemGuardAccessor(attention_head_size)
| | +- EQUALS_MATCH: L['attention_head_size'] == 64
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['np'], accessed_by=DictGetItemGuardAccessor(np)
| | | +- ID_MATCH: ___check_obj_id(G['np'], 139845228893488)
| | | +- GuardManager: source=G['np'].random, accessed_by=GetAttrGuardAccessor(random)
| | | | +- ID_MATCH: ___check_obj_id(G['np'].random, 139842452860464)
| | | | +- GuardManager: source=G['np'].random.seed, accessed_by=GetAttrGuardAccessor(seed)
| | | | | +- ID_MATCH: ___check_obj_id(G['np'].random.seed, 139842451129264)
| | +- GuardManager: source=G['math'], accessed_by=DictGetItemGuardAccessor(math)
| | | +- ID_MATCH: ___check_obj_id(G['math'], 139845236089744)
| | | +- GuardManager: source=G['math'].sqrt, accessed_by=GetAttrGuardAccessor(sqrt)
| | | | +- ID_MATCH: ___check_obj_id(G['math'].sqrt, 139845236093344)
V0627 17:31:10.615000 139845268738432 torch/_dynamo/] {"compilation_metrics": {"compile_id": "10/1", "frame_key": "27", "co_name": "bigbird_block_sparse_attention", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/", "co_firstlineno": 516, "cache_size": 1, "accumulated_cache_size": 1, "guard_count": 17, "shape_env_guard_count": 0, "graph_op_count": 0, "graph_node_count": 0, "graph_input_count": 0, "start_time": 1719534670.5898829, "entire_frame_compile_time_s": 0.02506852149963379, "backend_compile_time_s": null, "inductor_compile_time_s": null, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Graph break due to unsupported builtin numpy.random.mtrand.seed. This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind). If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround. If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see for more details) or, if it is traceable, use torch.compiler.allow_in_graph."], "dynamo_time_before_restart_s": 0.009800434112548828, "has_guarded_code": true}, "frame_id": 10, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.615000 139845268738432 torch/_dynamo/] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 472, "name": "forward", "filename": 5}, {"line": 569, "name": "bigbird_block_sparse_attention", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 11, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.647000 139845268738432 torch/_dynamo/] {"dynamo_guards": {}, "frame_id": 11, "frame_compile_id": 1, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
V0627 17:31:10.648000 139845268738432 torch/_dynamo/] {"dynamo_cpp_guards_str": {}, "frame_id": 11, "frame_compile_id": 1, "attempt": 1, "has_payload": "b6b8c289bd494c29f862b3959f02ec26"}
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/ in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202264976)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| +- GuardManager: source=L['n_heads'], accessed_by=DictGetItemGuardAccessor(n_heads)
| | +- EQUALS_MATCH: L['n_heads'] == 12
| +- GuardManager: source=L['to_seq_len'], accessed_by=DictGetItemGuardAccessor(to_seq_len)
| | +- EQUALS_MATCH: L['to_seq_len'] == 832
| +- GuardManager: source=L['from_seq_len'], accessed_by=DictGetItemGuardAccessor(from_seq_len)
| | +- EQUALS_MATCH: L['from_seq_len'] == 832
| +- GuardManager: source=L['n_rand_blocks'], accessed_by=DictGetItemGuardAccessor(n_rand_blocks)
| | +- EQUALS_MATCH: L['n_rand_blocks'] == 3
| +- GuardManager: source=L['to_block_size'], accessed_by=DictGetItemGuardAccessor(to_block_size)
| | +- EQUALS_MATCH: L['to_block_size'] == 64
| +- GuardManager: source=L['from_block_size'], accessed_by=DictGetItemGuardAccessor(from_block_size)
| | +- EQUALS_MATCH: L['from_block_size'] == 64
| +- GuardManager: source=L['plan_from_length'], accessed_by=DictGetItemGuardAccessor(plan_from_length)
| | +- ID_MATCH: ___check_obj_id(L['plan_from_length'], 7636800)
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['__builtins_dict___69'], accessed_by=DictGetItemGuardAccessor(__builtins_dict___69)
| | | +- GuardManager: source=G['__builtins_dict___69']['int'], accessed_by=DictGetItemGuardAccessor(int)
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___69']['int'], 7648640)
V0627 17:31:10.648000 139845268738432 torch/_dynamo/] {"compilation_metrics": {"compile_id": "11/1", "frame_key": "28", "co_name": "torch_dynamo_resume_in_bigbird_block_sparse_attention_at_569", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/", "co_firstlineno": 569, "cache_size": 0, "accumulated_cache_size": 1, "guard_count": 14, "shape_env_guard_count": 0, "graph_op_count": 0, "graph_node_count": 0, "graph_input_count": 0, "start_time": 1719534670.6159284, "entire_frame_compile_time_s": 0.03219175338745117, "backend_compile_time_s": null, "inductor_compile_time_s": null, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["data dependent operator: aten._local_scalar_dense.default; to enable, set torch._dynamo.config.capture_scalar_outputs = True"], "dynamo_time_before_restart_s": 0.01743292808532715, "has_guarded_code": true}, "frame_id": 11, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.649000 139845268738432 torch/_dynamo/] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 472, "name": "forward", "filename": 5}, {"line": 569, "name": "bigbird_block_sparse_attention", "filename": 5}, {"line": 583, "name": "torch_dynamo_resume_in_bigbird_block_sparse_attention_at_569", "filename": 5}, {"line": 1165, "name": "_bigbird_block_rand_mask_with_head", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 14, 
"frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.653000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 0, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.653000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e300921b0>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.653000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 58, "id": 0, "source": "___from_numpy(L['___stack0'][0])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.655000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 1, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.655000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30091ee0>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.655000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 58, "id": 1, "source": "___from_numpy(L['___stack0'][1])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.657000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 2, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.657000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 2, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30092c00>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.657000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 58, "id": 2, "source": "___from_numpy(L['___stack0'][2])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.659000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 3, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.659000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 3, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30152570>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.659000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 58, "id": 3, "source": "___from_numpy(L['___stack0'][3])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.661000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 4, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.661000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 4, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 4, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ffc0040>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.661000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 58, "id": 4, "source": "___from_numpy(L['___stack0'][4])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.663000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 5, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.663000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 5, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 5, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ffc0950>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.663000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 58, "id": 5, "source": "___from_numpy(L['___stack0'][5])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.665000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 6, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.665000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 6, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 6, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ffc0680>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.665000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 58, "id": 6, "source": "___from_numpy(L['___stack0'][6])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.667000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 7, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.667000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 7, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 7, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ffc1530>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.667000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 58, "id": 7, "source": "___from_numpy(L['___stack0'][7])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.669000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 8, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.669000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 8, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 8, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ffc1e90>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.669000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 58, "id": 8, "source": "___from_numpy(L['___stack0'][8])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.671000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 9, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.671000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 9, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 9, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ffc2840>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.671000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 58, "id": 9, "source": "___from_numpy(L['___stack0'][9])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.673000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 10, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.673000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 10, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 10, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ffc30b0>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.674000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 58, "id": 10, "source": "___from_numpy(L['___stack0'][10])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.675000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 11, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.676000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 11, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 11, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ffc3470>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.676000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 58, "id": 11, "source": "___from_numpy(L['___stack0'][11])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.681000 139845268738432 torch/_dynamo/] {"dynamo_output_graph": {"sizes": {"l_stack0_0_": [13, 3], "l_stack0_1_": [13, 3], "l_stack0_2_": [13, 3], "l_stack0_3_": [13, 3], "l_stack0_4_": [13, 3], "l_stack0_5_": [13, 3], "l_stack0_6_": [13, 3], "l_stack0_7_": [13, 3], "l_stack0_8_": [13, 3], "l_stack0_9_": [13, 3], "l_stack0_10_": [13, 3], "l_stack0_11_": [13, 3], "wrapped_getitem": [11, 3], "wrapped_getitem_1": [11, 3], "wrapped_getitem_2": [11, 3], "wrapped_getitem_3": [11, 3], "wrapped_getitem_4": [11, 3], "wrapped_getitem_5": [11, 3], "wrapped_getitem_6": [11, 3], "wrapped_getitem_7": [11, 3], "wrapped_getitem_8": [11, 3], "wrapped_getitem_9": [11, 3], "wrapped_getitem_10": [11, 3], "wrapped_getitem_11": [11, 3]}}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0, "has_payload": "17b857365a2979b2936469716bb70fcd"}
class GraphModule(torch.nn.Module):
def forward(self, L_stack0_0_: "i32[13, 3][3, 1]cpu", L_stack0_1_: "i32[13, 3][3, 1]cpu", L_stack0_2_: "i32[13, 3][3, 1]cpu", L_stack0_3_: "i32[13, 3][3, 1]cpu", L_stack0_4_: "i32[13, 3][3, 1]cpu", L_stack0_5_: "i32[13, 3][3, 1]cpu", L_stack0_6_: "i32[13, 3][3, 1]cpu", L_stack0_7_: "i32[13, 3][3, 1]cpu", L_stack0_8_: "i32[13, 3][3, 1]cpu", L_stack0_9_: "i32[13, 3][3, 1]cpu", L_stack0_10_: "i32[13, 3][3, 1]cpu", L_stack0_11_: "i32[13, 3][3, 1]cpu"):
l_stack0_0_ = L_stack0_0_
l_stack0_1_ = L_stack0_1_
l_stack0_2_ = L_stack0_2_
l_stack0_3_ = L_stack0_3_
l_stack0_4_ = L_stack0_4_
l_stack0_5_ = L_stack0_5_
l_stack0_6_ = L_stack0_6_
l_stack0_7_ = L_stack0_7_
l_stack0_8_ = L_stack0_8_
l_stack0_9_ = L_stack0_9_
l_stack0_10_ = L_stack0_10_
l_stack0_11_ = L_stack0_11_
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in__bigbird_block_rand_mask_with_head_at_1165, code: rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :]
wrapped_getitem: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem(l_stack0_0_, (slice(1, 12, None), slice(None, None, None))); l_stack0_0_ = None
wrapped_getitem_1: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_1(l_stack0_1_, (slice(1, 12, None), slice(None, None, None))); l_stack0_1_ = None
wrapped_getitem_2: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_2(l_stack0_2_, (slice(1, 12, None), slice(None, None, None))); l_stack0_2_ = None
wrapped_getitem_3: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_3(l_stack0_3_, (slice(1, 12, None), slice(None, None, None))); l_stack0_3_ = None
wrapped_getitem_4: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_4(l_stack0_4_, (slice(1, 12, None), slice(None, None, None))); l_stack0_4_ = None
wrapped_getitem_5: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_5(l_stack0_5_, (slice(1, 12, None), slice(None, None, None))); l_stack0_5_ = None
wrapped_getitem_6: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_6(l_stack0_6_, (slice(1, 12, None), slice(None, None, None))); l_stack0_6_ = None
wrapped_getitem_7: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_7(l_stack0_7_, (slice(1, 12, None), slice(None, None, None))); l_stack0_7_ = None
wrapped_getitem_8: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_8(l_stack0_8_, (slice(1, 12, None), slice(None, None, None))); l_stack0_8_ = None
wrapped_getitem_9: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_9(l_stack0_9_, (slice(1, 12, None), slice(None, None, None))); l_stack0_9_ = None
wrapped_getitem_10: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_10(l_stack0_10_, (slice(1, 12, None), slice(None, None, None))); l_stack0_10_ = None
wrapped_getitem_11: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_11(l_stack0_11_, (slice(1, 12, None), slice(None, None, None))); l_stack0_11_ = None
return (wrapped_getitem, wrapped_getitem_1, wrapped_getitem_2, wrapped_getitem_3, wrapped_getitem_4, wrapped_getitem_5, wrapped_getitem_6, wrapped_getitem_7, wrapped_getitem_8, wrapped_getitem_9, wrapped_getitem_10, wrapped_getitem_11)
V0627 17:31:10.751000 139845268738432 torch/_functorch/_aot_autograd/] {"aot_forward_graph": {}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0, "has_payload": "949e5e36955b193a43ad76da2f5a5f52"}
class <lambda>(torch.nn.Module):
def forward(self, arg0_1: "i32[13, 3][3, 1]cpu", arg1_1: "i32[13, 3][3, 1]cpu", arg2_1: "i32[13, 3][3, 1]cpu", arg3_1: "i32[13, 3][3, 1]cpu", arg4_1: "i32[13, 3][3, 1]cpu", arg5_1: "i32[13, 3][3, 1]cpu", arg6_1: "i32[13, 3][3, 1]cpu", arg7_1: "i32[13, 3][3, 1]cpu", arg8_1: "i32[13, 3][3, 1]cpu", arg9_1: "i32[13, 3][3, 1]cpu", arg10_1: "i32[13, 3][3, 1]cpu", arg11_1: "i32[13, 3][3, 1]cpu"):
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in__bigbird_block_rand_mask_with_head_at_1165, code: rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :]
slice_1: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg0_1, 0, 1, 12); arg0_1 = None
slice_2: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_1, 1, 0, 9223372036854775807); slice_1 = None
slice_3: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg1_1, 0, 1, 12); arg1_1 = None
slice_4: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_3, 1, 0, 9223372036854775807); slice_3 = None
slice_5: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg2_1, 0, 1, 12); arg2_1 = None
slice_6: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_5, 1, 0, 9223372036854775807); slice_5 = None
slice_7: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg3_1, 0, 1, 12); arg3_1 = None
slice_8: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_7, 1, 0, 9223372036854775807); slice_7 = None
slice_9: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg4_1, 0, 1, 12); arg4_1 = None
slice_10: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_9, 1, 0, 9223372036854775807); slice_9 = None
slice_11: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg5_1, 0, 1, 12); arg5_1 = None
slice_12: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_11, 1, 0, 9223372036854775807); slice_11 = None
slice_13: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg6_1, 0, 1, 12); arg6_1 = None
slice_14: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_13, 1, 0, 9223372036854775807); slice_13 = None
slice_15: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg7_1, 0, 1, 12); arg7_1 = None
slice_16: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_15, 1, 0, 9223372036854775807); slice_15 = None
slice_17: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg8_1, 0, 1, 12); arg8_1 = None
slice_18: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_17, 1, 0, 9223372036854775807); slice_17 = None
slice_19: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg9_1, 0, 1, 12); arg9_1 = None
slice_20: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_19, 1, 0, 9223372036854775807); slice_19 = None
slice_21: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg10_1, 0, 1, 12); arg10_1 = None
slice_22: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_21, 1, 0, 9223372036854775807); slice_21 = None
slice_23: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg11_1, 0, 1, 12); arg11_1 = None
slice_24: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_23, 1, 0, 9223372036854775807); slice_23 = None
return (slice_2, slice_4, slice_6, slice_8, slice_10, slice_12, slice_14, slice_16, slice_18, slice_20, slice_22, slice_24)
V0627 17:31:10.788000 139845268738432 torch/_inductor/] {"inductor_post_grad_graph": {}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0, "has_payload": "9c01c298863b324227937cc74dd4d962"}
class <lambda>(torch.nn.Module):
def forward(self, arg0_1: "i32[13, 3][3, 1]cpu", arg1_1: "i32[13, 3][3, 1]cpu", arg2_1: "i32[13, 3][3, 1]cpu", arg3_1: "i32[13, 3][3, 1]cpu", arg4_1: "i32[13, 3][3, 1]cpu", arg5_1: "i32[13, 3][3, 1]cpu", arg6_1: "i32[13, 3][3, 1]cpu", arg7_1: "i32[13, 3][3, 1]cpu", arg8_1: "i32[13, 3][3, 1]cpu", arg9_1: "i32[13, 3][3, 1]cpu", arg10_1: "i32[13, 3][3, 1]cpu", arg11_1: "i32[13, 3][3, 1]cpu"):
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in__bigbird_block_rand_mask_with_head_at_1165, code: rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :]
slice_1: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg0_1, 0, 1, 12); arg0_1 = None
slice_3: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg1_1, 0, 1, 12); arg1_1 = None
slice_5: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg2_1, 0, 1, 12); arg2_1 = None
slice_7: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg3_1, 0, 1, 12); arg3_1 = None
slice_9: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg4_1, 0, 1, 12); arg4_1 = None
slice_11: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg5_1, 0, 1, 12); arg5_1 = None
slice_13: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg6_1, 0, 1, 12); arg6_1 = None
slice_15: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg7_1, 0, 1, 12); arg7_1 = None
slice_17: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg8_1, 0, 1, 12); arg8_1 = None
slice_19: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg9_1, 0, 1, 12); arg9_1 = None
slice_21: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg10_1, 0, 1, 12); arg10_1 = None
slice_23: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg11_1, 0, 1, 12); arg11_1 = None
return (slice_1, slice_3, slice_5, slice_7, slice_9, slice_11, slice_13, slice_15, slice_17, slice_19, slice_21, slice_23)
V0627 17:31:10.802000 139845268738432 torch/_inductor/] {"inductor_output_code": {"filename": "/tmp/torchinductor_leslie/of/"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0, "has_payload": "c06d796ae11c7e77048735efc71e26ca"}
# AOT ID: ['14_inference']
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
from torch._inductor.codegen.memory_planning import _align as align
from torch import device, empty_strided
from torch._inductor.async_compile import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
from torch._inductor.codegen.multi_kernel import MultiKernelCall
aten = torch.ops.aten
inductor_ops = torch.ops.inductor
_quantized = torch.ops._quantized
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
alloc_from_pool = torch.ops.inductor._alloc_from_pool
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
async_compile = AsyncCompile()
del async_compile
def call(args):
arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1 = args
assert_size_stride(arg0_1, (13, 3), (3, 1))
assert_size_stride(arg1_1, (13, 3), (3, 1))
assert_size_stride(arg2_1, (13, 3), (3, 1))
assert_size_stride(arg3_1, (13, 3), (3, 1))
assert_size_stride(arg4_1, (13, 3), (3, 1))
assert_size_stride(arg5_1, (13, 3), (3, 1))
assert_size_stride(arg6_1, (13, 3), (3, 1))
assert_size_stride(arg7_1, (13, 3), (3, 1))
assert_size_stride(arg8_1, (13, 3), (3, 1))
assert_size_stride(arg9_1, (13, 3), (3, 1))
assert_size_stride(arg10_1, (13, 3), (3, 1))
assert_size_stride(arg11_1, (13, 3), (3, 1))
return (reinterpret_tensor(arg0_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg1_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg2_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg3_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg4_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg5_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg6_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg7_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg8_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg9_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg10_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg11_1, (11, 3), (3, 1), 3), )
def benchmark_compiled_module(times=10, repeat=10):
from torch._dynamo.testing import rand_strided
from torch._inductor.utils import print_performance
arg0_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
arg1_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
arg2_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
arg3_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
arg4_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
arg5_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
arg6_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
arg7_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
arg8_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
arg9_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
arg10_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
arg11_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1])
return print_performance(fn, times=times, repeat=repeat)
if __name__ == "__main__":
from torch._inductor.wrapper_benchmark import compiled_module_main
compiled_module_main('hf_BigBird', benchmark_compiled_module)
V0627 17:31:10.810000 139845268738432 torch/_dynamo/] {"dynamo_guards": {}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
V0627 17:31:10.810000 139845268738432 torch/_dynamo/] {"dynamo_cpp_guards_str": {}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0, "has_payload": "6e13f24b700fd79116617b1177bb6706"}
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/ in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202264976)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| +- GuardManager: source=L['___stack0'], accessed_by=DictGetItemGuardAccessor(___stack0)
| | +- TYPE_MATCH: ___check_type_id(L['___stack0'], 7650400)
| | +- LENGTH_CHECK: len(L['___stack0']) == 12
| | +- GuardManager: source=L['___stack0'][0], accessed_by=ListGetItemGuardAccessor(0)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][0]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][0]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][0]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][1], accessed_by=ListGetItemGuardAccessor(1)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][1]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][1]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][1]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][2], accessed_by=ListGetItemGuardAccessor(2)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][2]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][2]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][2]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][3], accessed_by=ListGetItemGuardAccessor(3)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][3]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][3]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][3]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][4], accessed_by=ListGetItemGuardAccessor(4)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][4]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][4]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][4]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][5], accessed_by=ListGetItemGuardAccessor(5)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][5]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][5]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][5]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][6], accessed_by=ListGetItemGuardAccessor(6)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][6]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][6]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][6]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][7], accessed_by=ListGetItemGuardAccessor(7)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][7]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][7]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][7]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][8], accessed_by=ListGetItemGuardAccessor(8)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][8]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][8]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][8]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][9], accessed_by=ListGetItemGuardAccessor(9)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][9]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][9]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][9]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][10], accessed_by=ListGetItemGuardAccessor(10)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][10]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][10]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][10]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][11], accessed_by=ListGetItemGuardAccessor(11)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][11]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][11]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][11]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| +- GuardManager: source=L['num_heads'], accessed_by=DictGetItemGuardAccessor(num_heads)
| | +- EQUALS_MATCH: L['num_heads'] == 12
| +- GuardManager: source=L['num_blocks'], accessed_by=DictGetItemGuardAccessor(num_blocks)
| | +- EQUALS_MATCH: L['num_blocks'] == 13
| +- GuardManager: source=L['global_block_top'], accessed_by=DictGetItemGuardAccessor(global_block_top)
| | +- EQUALS_MATCH: L['global_block_top'] == 1
| +- GuardManager: source=L['global_block_bottom'], accessed_by=DictGetItemGuardAccessor(global_block_bottom)
| | +- EQUALS_MATCH: L['global_block_bottom'] == 1
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['__builtins_dict___71'], accessed_by=DictGetItemGuardAccessor(__builtins_dict___71)
| | | +- GuardManager: source=G['__builtins_dict___71']['range'], accessed_by=DictGetItemGuardAccessor(range)
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___71']['range'], 7632448)
V0627 17:31:10.810000 139845268738432 torch/_dynamo/] {"compilation_metrics": {"compile_id": "14/1", "frame_key": "29", "co_name": "torch_dynamo_resume_in__bigbird_block_rand_mask_with_head_at_1165", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/", "co_firstlineno": 1165, "cache_size": 0, "accumulated_cache_size": 1, "guard_count": 25, "shape_env_guard_count": 0, "graph_op_count": 12, "graph_node_count": 25, "graph_input_count": 12, "start_time": 1719534670.6493185, "entire_frame_compile_time_s": 0.16145634651184082, "backend_compile_time_s": 0.12227082252502441, "inductor_compile_time_s": 0.022518634796142578, "code_gen_time_s": 0.0035479068756103516, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": [], "dynamo_time_before_restart_s": 0.0, "has_guarded_code": true}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.811000 139845268738432 torch/_dynamo/] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 472, "name": "forward", "filename": 5}, {"line": 569, "name": "bigbird_block_sparse_attention", "filename": 5}, {"line": 583, "name": "torch_dynamo_resume_in_bigbird_block_sparse_attention_at_569", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.818000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 0, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.818000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ff64360>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.818000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 60, "id": 0, "source": "___from_numpy(L['___stack0'][0])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.819000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 1, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.819000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ff5e020>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.819000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 60, "id": 1, "source": "___from_numpy(L['___stack0'][1])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.820000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 2, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.820000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 2, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ff5f5b0>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.820000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 60, "id": 2, "source": "___from_numpy(L['___stack0'][2])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.821000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 3, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.821000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 3, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e300934c0>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.821000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 60, "id": 3, "source": "___from_numpy(L['___stack0'][3])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.822000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 4, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.822000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 4, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 4, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ff46cf0>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.822000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 60, "id": 4, "source": "___from_numpy(L['___stack0'][4])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.823000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 5, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.823000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 5, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 5, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ff44d60>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.823000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 60, "id": 5, "source": "___from_numpy(L['___stack0'][5])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.823000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 6, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.824000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 6, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 6, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ffb1a80>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.824000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 60, "id": 6, "source": "___from_numpy(L['___stack0'][6])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.824000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 7, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.825000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 7, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 7, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e300dbbf0>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.825000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 60, "id": 7, "source": "___from_numpy(L['___stack0'][7])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.825000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 8, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.825000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 8, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 8, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30090bd0>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.826000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 60, "id": 8, "source": "___from_numpy(L['___stack0'][8])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.826000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 9, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.826000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 9, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 9, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ff67920>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.826000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 60, "id": 9, "source": "___from_numpy(L['___stack0'][9])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.827000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 10, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.827000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 10, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 10, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ffc3010>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.827000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 60, "id": 10, "source": "___from_numpy(L['___stack0'][10])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.828000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 11, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.828000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 11, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 11, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ffc20c0>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.828000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 60, "id": 11, "source": "___from_numpy(L['___stack0'][11])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.830000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 12, "describer_id": 60, "size": 1277952}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.830000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 12, "ndim": 4, "dtype": "torch.bfloat16", "device": "device(type='cpu')", "size": [1, 12, 832, 64], "is_leaf": true, "stride": [638976, 64, 768, 1], "storage": 12, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30014c20>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.830000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 60, "id": 12, "source": "L['query_layer']"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.833000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 13, "describer_id": 60, "size": 3328}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.834000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 14, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 13, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.834000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 13, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 13, 64], "is_leaf": true, "is_view": true, "stride": [832, 64, 1], "storage": 13, "base": 14, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317fbec0>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.834000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 60, "id": 13, "source": "L['from_blocked_mask']"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.840000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 14, "describer_id": 60, "size": 1277952}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.840000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 15, "ndim": 4, "dtype": "torch.bfloat16", "device": "device(type='cpu')", "size": [1, 12, 832, 64], "is_leaf": true, "stride": [638976, 64, 768, 1], "storage": 14, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30014220>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.840000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 60, "id": 15, "source": "L['key_layer']"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.841000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 15, "describer_id": 60, "size": 1277952}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.842000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 16, "ndim": 4, "dtype": "torch.bfloat16", "device": "device(type='cpu')", "size": [1, 12, 832, 64], "is_leaf": true, "stride": [638976, 64, 768, 1], "storage": 15, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30015d50>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.842000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 60, "id": 16, "source": "L['value_layer']"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.856000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 17, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 13, "base": 14, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.856000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 60, "id": 17, "source": "L['to_mask']"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.893000 139845268738432 torch/_subclasses/] {"describe_storage": {"id": 16, "describer_id": 60, "size": 442368}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.893000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 18, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 16, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.893000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 60, "id": 18, "source": "L['band_mask']"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.936000 139845268738432 torch/_subclasses/] {"describe_tensor": {"id": 19, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 13, "base": 14, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.936000 139845268738432 torch/_subclasses/] {"describe_source": {"describer_id": 60, "id": 19, "source": "L['from_mask']"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.952000 139845268738432 torch/_dynamo/] {"dynamo_output_graph": {"sizes": {"l_stack0_0_": [11, 3], "l_stack0_1_": [11, 3], "l_stack0_2_": [11, 3], "l_stack0_3_": [11, 3], "l_stack0_4_": [11, 3], "l_stack0_5_": [11, 3], "l_stack0_6_": [11, 3], "l_stack0_7_": [11, 3], "l_stack0_8_": [11, 3], "l_stack0_9_": [11, 3], "l_stack0_10_": [11, 3], "l_stack0_11_": [11, 3], "l_query_layer_": [1, 12, 832, 64], "l_from_blocked_mask_": [1, 13, 64], "l_key_layer_": [1, 12, 832, 64], "l_value_layer_": [1, 12, 832, 64], "l_to_mask_": [1, 1, 1, 832], "l_band_mask_": [1, 1, 9, 64, 192], "l_from_mask_": [1, 1, 832, 1], "rand_attn": [12, 11, 3], "rand_attn_1": [1, 12, 11, 3], "unsqueeze_": [1, 12, 11, 3], "rand_attn_2": [1, 12, 11, 3], "p1": [13, 64], "i1": [12, 11, 3], "flatten": [396], "getitem_2": [396, 64], "rand_mask": [1, 396, 64], "rand_mask_1": [1, 12, 11, 192], "getitem_3": [1, 11, 64], "rand_mask_2": [1, 12, 11, 64, 192], "blocked_query_matrix": [1, 12, 13, 64, 64], "blocked_key_matrix": [1, 12, 13, 64, 64], "blocked_value_matrix": [1, 12, 13, 64, 64], "shift": [396], "div": [396], "indices_shift": [396], "view_4": [396], "flattened_indices": [396], "flattened_params": [156, 64, 64], "out_flattened": [396, 64, 64], "out": [1, 12, 33, 64, 64], "gathered_key": [1, 12, 11, 192, 64], "shift_1": [396], "div_1": [396], "indices_shift_1": [396], "view_6": [396], "flattened_indices_1": [396], "flattened_params_1": [156, 64, 64], "out_flattened_1": [396, 64, 64], "out_1": [1, 12, 33, 64, 64], "gathered_value": [1, 12, 11, 192, 64], "getitem_4": [1, 12, 64, 64], "reshape_4": [12, 64, 64], "reshape_5": [12, 832, 64], "transpose": [12, 64, 832], "bmm": [12, 64, 832], "first_product": [1, 12, 64, 832], "first_product_1": [1, 12, 64, 832], "sub": [1, 1, 1, 832], "mul_3": [1, 1, 1, 832], "first_product_2": [1, 12, 64, 832], "first_attn_weights": [1, 12, 64, 832], "reshape_6": [12, 64, 832], "reshape_7": [12, 832, 64], "bmm_1": [12, 64, 64], "first_context_layer": [1, 12, 1, 64, 
64], "unsqueeze__1": [1, 12, 1, 64, 64], "getitem_5": [1, 12, 64, 64], "getitem_6": [1, 12, 64, 64], "getitem_7": [1, 12, 64, 64], "getitem_8": [1, 12, 64, 64], "getitem_9": [1, 12, 192, 64], "second_key_mat": [1, 12, 448, 64], "getitem_10": [1, 12, 64, 64], "getitem_11": [1, 12, 64, 64], "getitem_12": [1, 12, 64, 64], "getitem_13": [1, 12, 64, 64], "getitem_14": [1, 12, 192, 64], "second_value_mat": [1, 12, 448, 64], "getitem_15": [1, 12, 64, 64], "reshape_8": [12, 64, 64], "reshape_9": [12, 448, 64], "transpose_1": [12, 64, 448], "bmm_2": [12, 64, 448], "second_product": [1, 12, 64, 448], "getitem_16": [1, 1, 1, 192], "getitem_17": [1, 1, 1, 64], "new_ones": [1, 1, 1, 192], "second_seq_pad": [1, 1, 1, 448], "new_ones_1": [1, 12, 64, 256], "getitem_18": [1, 12, 64, 192], "second_rand_pad": [1, 12, 64, 448], "second_product_1": [1, 12, 64, 448], "minimum": [1, 12, 64, 448], "sub_1": [1, 12, 64, 448], "mul_5": [1, 12, 64, 448], "second_product_2": [1, 12, 64, 448], "second_attn_weights": [1, 12, 64, 448], "reshape_10": [12, 64, 448], "reshape_11": [12, 448, 64], "bmm_3": [12, 64, 64], "second_context_layer": [1, 12, 1, 64, 64], "unsqueeze__2": [1, 12, 1, 64, 64], "getitem_19": [1, 12, 9, 64, 64], "getitem_20": [1, 12, 9, 64, 64], "getitem_21": [1, 12, 9, 64, 64], "exp_blocked_key_matrix": [1, 12, 9, 192, 64], "getitem_22": [1, 12, 9, 64, 64], "getitem_23": [1, 12, 9, 64, 64], "getitem_24": [1, 12, 9, 64, 64], "exp_blocked_value_matrix": [1, 12, 9, 192, 64], "middle_query_matrix": [1, 12, 9, 64, 64], "reshape_12": [108, 64, 64], "reshape_13": [108, 192, 64], "transpose_2": [108, 64, 192], "bmm_4": [108, 64, 192], "inner_band_product": [1, 12, 9, 64, 192], "inner_band_product_1": [1, 12, 9, 64, 192], "getitem_26": [1, 12, 9, 192, 64], "reshape_14": [108, 64, 64], "reshape_15": [108, 192, 64], "transpose_3": [108, 64, 192], "bmm_5": [108, 64, 192], "rand_band_product": [1, 12, 9, 64, 192], "rand_band_product_1": [1, 12, 9, 64, 192], "getitem_27": [1, 12, 64, 64], 
"first_band_product": [1, 12, 9, 64, 64], "first_band_product_1": [1, 12, 9, 64, 64], "getitem_28": [1, 12, 64, 64], "last_band_product": [1, 12, 9, 64, 64], "last_band_product_1": [1, 12, 9, 64, 64], "sub_2": [1, 1, 9, 64, 192], "mul_10": [1, 1, 9, 64, 192], "inner_band_product_2": [1, 12, 9, 64, 192], "getitem_29": [1, 1, 1, 64], "unsqueeze": [1, 1, 1, 1, 64], "sub_3": [1, 1, 1, 1, 64], "mul_11": [1, 1, 1, 1, 64], "first_band_product_2": [1, 12, 9, 64, 64], "getitem_30": [1, 1, 1, 64], "unsqueeze_1": [1, 1, 1, 1, 64], "sub_4": [1, 1, 1, 1, 64], "mul_12": [1, 1, 1, 1, 64], "last_band_product_2": [1, 12, 9, 64, 64], "getitem_31": [1, 12, 9, 64, 192], "sub_5": [1, 12, 9, 64, 192], "mul_13": [1, 12, 9, 64, 192], "rand_band_product_2": [1, 12, 9, 64, 192], "band_product": [1, 12, 9, 64, 512], "attn_weights": [1, 12, 9, 64, 512], "getitem_32": [1, 12, 9, 64, 192], "reshape_16": [108, 64, 192], "reshape_17": [108, 192, 64], "bmm_6": [108, 64, 64], "context_layer": [1, 12, 9, 64, 64], "getitem_33": [1, 12, 9, 64, 192], "getitem_34": [1, 12, 9, 192, 64], "reshape_18": [108, 64, 192], "reshape_19": [108, 192, 64], "bmm_7": [108, 64, 64], "view_15": [1, 12, 9, 64, 64], "context_layer_1": [1, 12, 9, 64, 64], "getitem_35": [1, 12, 9, 64, 64], "getitem_36": [1, 12, 64, 64], "einsum_3": [1, 12, 9, 64, 64], "context_layer_2": [1, 12, 9, 64, 64], "getitem_37": [1, 12, 9, 64, 64], "getitem_38": [1, 12, 64, 64], "einsum_4": [1, 12, 9, 64, 64], "context_layer_3": [1, 12, 9, 64, 64], "getitem_39": [1, 12, 64, 64], "getitem_40": [1, 12, 64, 64], "getitem_41": [1, 12, 64, 64], "getitem_42": [1, 12, 64, 64], "getitem_43": [1, 12, 192, 64], "second_last_key_mat": [1, 12, 448, 64], "getitem_44": [1, 12, 64, 64], "getitem_45": [1, 12, 64, 64], "getitem_46": [1, 12, 64, 64], "getitem_47": [1, 12, 64, 64], "getitem_48": [1, 12, 192, 64], "second_last_value_mat": [1, 12, 448, 64], "getitem_49": [1, 12, 64, 64], "reshape_20": [12, 64, 64], "reshape_21": [12, 448, 64], "transpose_4": [12, 64, 
448], "bmm_8": [12, 64, 448], "second_last_product": [1, 12, 64, 448], "getitem_50": [1, 1, 1, 64], "getitem_51": [1, 1, 1, 192], "new_ones_2": [1, 1, 1, 192], "second_last_seq_pad": [1, 1, 1, 448], "new_ones_3": [1, 12, 64, 256], "getitem_52": [1, 12, 64, 192], "second_last_rand_pad": [1, 12, 64, 448], "second_last_product_1": [1, 12, 64, 448], "minimum_1": [1, 12, 64, 448], "sub_6": [1, 12, 64, 448], "mul_15": [1, 12, 64, 448], "second_last_product_2": [1, 12, 64, 448], "second_last_attn_weights": [1, 12, 64, 448], "reshape_22": [12, 64, 448], "reshape_23": [12, 448, 64], "bmm_9": [12, 64, 64], "second_last_context_layer": [1, 12, 1, 64, 64], "unsqueeze__3": [1, 12, 1, 64, 64], "getitem_53": [1, 12, 64, 64], "reshape_24": [12, 64, 64], "reshape_25": [12, 832, 64], "transpose_5": [12, 64, 832], "bmm_10": [12, 64, 832], "last_product": [1, 12, 64, 832], "last_product_1": [1, 12, 64, 832], "sub_7": [1, 1, 1, 832], "mul_17": [1, 1, 1, 832], "last_product_2": [1, 12, 64, 832], "last_attn_weights": [1, 12, 64, 832], "reshape_26": [12, 64, 832], "reshape_27": [12, 832, 64], "bmm_11": [12, 64, 64], "last_context_layer": [1, 12, 1, 64, 64], "unsqueeze__4": [1, 12, 1, 64, 64], "context_layer_4": [1, 12, 13, 64, 64], "view_20": [1, 12, 832, 64], "context_layer_5": [1, 12, 832, 64], "context_layer_6": [1, 832, 12, 64]}}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0, "has_payload": "8c75f78cbe780240c3647a51d604b94a"}
class GraphModule(torch.nn.Module):
def forward(self, L_stack0_0_: "i32[11, 3][3, 1]cpu", L_stack0_1_: "i32[11, 3][3, 1]cpu", L_stack0_2_: "i32[11, 3][3, 1]cpu", L_stack0_3_: "i32[11, 3][3, 1]cpu", L_stack0_4_: "i32[11, 3][3, 1]cpu", L_stack0_5_: "i32[11, 3][3, 1]cpu", L_stack0_6_: "i32[11, 3][3, 1]cpu", L_stack0_7_: "i32[11, 3][3, 1]cpu", L_stack0_8_: "i32[11, 3][3, 1]cpu", L_stack0_9_: "i32[11, 3][3, 1]cpu", L_stack0_10_: "i32[11, 3][3, 1]cpu", L_stack0_11_: "i32[11, 3][3, 1]cpu", L_query_layer_: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", L_from_blocked_mask_: "f32[1, 13, 64][832, 64, 1]cpu", L_key_layer_: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", L_value_layer_: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", L_to_mask_: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu", L_band_mask_: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu", L_from_mask_: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu"):
l_stack0_0_ = L_stack0_0_
l_stack0_1_ = L_stack0_1_
l_stack0_2_ = L_stack0_2_
l_stack0_3_ = L_stack0_3_
l_stack0_4_ = L_stack0_4_
l_stack0_5_ = L_stack0_5_
l_stack0_6_ = L_stack0_6_
l_stack0_7_ = L_stack0_7_
l_stack0_8_ = L_stack0_8_
l_stack0_9_ = L_stack0_9_
l_stack0_10_ = L_stack0_10_
l_stack0_11_ = L_stack0_11_
l_query_layer_ = L_query_layer_
l_from_blocked_mask_ = L_from_blocked_mask_
l_key_layer_ = L_key_layer_
l_value_layer_ = L_value_layer_
l_to_mask_ = L_to_mask_
l_band_mask_ = L_band_mask_
l_from_mask_ = L_from_mask_
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = np.stack(rand_attn, axis=0)
rand_attn: "i32[12, 11, 3][33, 3, 1]cpu" = torch__dynamo_utils_wrapped_stack([l_stack0_0_, l_stack0_1_, l_stack0_2_, l_stack0_3_, l_stack0_4_, l_stack0_5_, l_stack0_6_, l_stack0_7_, l_stack0_8_, l_stack0_9_, l_stack0_10_, l_stack0_11_], axis = 0); l_stack0_0_ = l_stack0_1_ = l_stack0_2_ = l_stack0_3_ = l_stack0_4_ = l_stack0_5_ = l_stack0_6_ = l_stack0_7_ = l_stack0_8_ = l_stack0_9_ = l_stack0_10_ = l_stack0_11_ = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = torch.tensor(rand_attn, device=query_layer.device, dtype=torch.long)
rand_attn_1: "i64[1, 12, 11, 3][396, 33, 3, 1]cpu" = torch.tensor(rand_attn, device = device(type='cpu'), dtype = torch.int64); rand_attn = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn.unsqueeze_(0)
unsqueeze_: "i64[1, 12, 11, 3][396, 33, 3, 1]cpu" = rand_attn_1.unsqueeze_(0)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn =[rand_attn for _ in range(batch_size)], dim=0)
rand_attn_2: "i64[1, 12, 11, 3][396, 33, 3, 1]cpu" =[rand_attn_1], dim = 0); rand_attn_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in _create_rand_mask_from_inputs, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
p1: "f32[13, 64][64, 1]cpu" = l_from_blocked_mask_[0]
i1: "i64[12, 11, 3][33, 3, 1]cpu" = rand_attn_2[0]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in <listcomp>, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
flatten: "i64[396][1]cpu" = i1.flatten(); i1 = None
getitem_2: "f32[396, 64][64, 1]cpu" = p1[flatten]; p1 = flatten = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in _create_rand_mask_from_inputs, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
rand_mask: "f32[1, 396, 64][25344, 64, 1]cpu" = torch.stack([getitem_2]); getitem_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/ in _create_rand_mask_from_inputs, code: rand_mask = rand_mask.view(batch_size, num_attention_heads, num_windows, num_rand_blocks * from_block_size)
rand_mask_1: "f32[1, 12, 11, 192][25344, 2112, 192, 1]cpu" = rand_mask.view(1, 12, 11, 192); rand_mask = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-pa
View raw

(Sorry about that, but we can’t show files that are this big right now.)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment