@leslie-fang-intel
Created June 28, 2024 00:32
trace log for 128513
This file has been truncated, but you can view the full file.
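The structured entries below are what PyTorch emits when the TORCH_TRACE environment variable points at a log directory; tlparse can then render such a file into a browsable report. A minimal sketch of producing a trace like this one (the paths and the toy function are assumptions, not taken from this log):

import os
os.environ["TORCH_TRACE"] = "/tmp/trace_logs"  # set before torch initializes its logging

import torch

@torch.compile(backend="inductor")
def f(x):
    # any compiled function will do; each compiled frame produces entries like the ones below
    return torch.ones_like(x) + x

f(torch.randn(8))
# then, from a shell: tlparse /tmp/trace_logs/<generated log file>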
V0627 17:31:00.663000 139845268738432 torch/_logging/structured.py:19] {"str": ["/localdisk/leslie/torch_inductor_community/pytorch/benchmarks/dynamo/torchbench.py", 0]}
V0627 17:31:00.663000 139845268738432 torch/_logging/structured.py:19] {"str": ["/localdisk/leslie/torch_inductor_community/pytorch/benchmarks/dynamo/common.py", 1]}
V0627 17:31:00.663000 139845268738432 torch/_logging/structured.py:19] {"str": ["/localdisk/leslie/torch_inductor_community/pytorch/torch/_dynamo/eval_frame.py", 2]}
V0627 17:31:00.663000 139845268738432 torch/_logging/structured.py:19] {"str": ["/localdisk/leslie/torch_inductor_community/pytorch/torch/_dynamo/convert_frame.py", 3]}
V0627 17:31:00.663000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:00.691000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 0, "size": 6552}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:00.692000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 819], "is_leaf": true, "stride": [819, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2ef4724e50>", "describer_id": 0}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:00.692000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 0, "id": 0, "source": "L['inputs'][0]"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:00.701000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 0, "size": 32768}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:00.701000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 4096], "is_leaf": true, "stride": [4096, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2eb1d44450>", "describer_id": 0}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:00.701000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 0, "id": 1, "source": "L['mod'].bert.embeddings.token_type_ids"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:00.718000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {}}, "frame_id": 0, "frame_compile_id": 0, "attempt": 1, "has_payload": "8f3f91fb1d48d67b1336de49ea694c74"}
class GraphModule(torch.nn.Module):
    def forward(self):
        # No stacktrace found for following nodes
        _enter_autocast = torch.amp.autocast_mode._enter_autocast('cpu', None, True, None)
        _exit_autocast = torch.amp.autocast_mode._exit_autocast(_enter_autocast); _enter_autocast = None
        return ()
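This first graph is empty apart from the autocast enter/exit pair: frame 0 is the benchmark harness's forward_pass wrapper, which only opens a CPU autocast context around the model call (the model itself is compiled as later frames). Roughly, as a sketch of the harness shape inferred from the guards on L['self'].autocast below (names assumed, not copied from torchbench):

import functools
import torch

autocast = functools.partial(torch.autocast, device_type="cpu")  # matches the keywords guard

def forward_pass(mod, inputs):
    # only the context manager is captured in frame 0; mod(*inputs) compiles separately
    with autocast():
        return mod(*inputs)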
V0627 17:31:01.398000 139845268738432 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:199] {"aot_forward_graph": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 1, "has_payload": "845c30ca0008a08ec62276cecc47183b"}
class <lambda>(torch.nn.Module):
    def forward(self):
        return ()
V0627 17:31:01.498000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:01.498000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 1, "has_payload": "40c07a4da7b433b5416cc93985646719"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['mod'], accessed_by=DictGetItemGuardAccessor(mod)
| | +- ID_MATCH: ___check_obj_id(L['mod'], 139839714901824)
| | +- GuardManager: source=L['mod'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['mod'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['mod'].training, 7685824)
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- TYPE_MATCH: ___check_type_id(L['self'], 139842378438672)
| | +- GuardManager: source=L['self'].autocast, accessed_by=GetAttrGuardAccessor(autocast)
| | | +- TYPE_MATCH: ___check_type_id(L['self'].autocast, 139845255007760)
| | | +- GuardManager: source=L['self'].autocast.args, accessed_by=GetAttrGuardAccessor(args)
| | | | +- TYPE_MATCH: ___check_type_id(L['self'].autocast.args, 7625984)
| | | | +- LENGTH_CHECK: not L['self'].autocast.args
| | | +- GuardManager: source=L['self'].autocast.func, accessed_by=GetAttrGuardAccessor(func)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].autocast.func, 139844826956816)
| | | +- GuardManager: source=L['self'].autocast.keywords, accessed_by=GetAttrGuardAccessor(keywords)
| | | | +- TYPE_MATCH: ___check_type_id(L['self'].autocast.keywords, 7646656)
| | | | +- GuardManager: source=L['self'].autocast.keywords['device_type'], accessed_by=DictGetItemGuardAccessor(device_type)
| | | | | +- EQUALS_MATCH: L['self'].autocast.keywords['device_type'] == 'cpu'
| | +- GuardManager: source=L['self'].autocast_arg, accessed_by=GetAttrGuardAccessor(autocast_arg)
| | | +- TYPE_MATCH: ___check_type_id(L['self'].autocast_arg, 7646656)
| | | +- DICT_LENGTH: not L['self'].autocast_arg
| +- GuardManager: source=L['inputs'], accessed_by=DictGetItemGuardAccessor(inputs)
| | +- TYPE_MATCH: ___check_type_id(L['inputs'], 7625984)
| | +- LENGTH_CHECK: len(L['inputs']) == 1
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['__builtins_dict___1'], accessed_by=DictGetItemGuardAccessor(__builtins_dict___1)
| | | +- GuardManager: source=G['__builtins_dict___1']['dict'], accessed_by=DictGetItemGuardAccessor(dict)
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___1']['dict'], 7646656)
| | | +- GuardManager: source=G['__builtins_dict___1']['isinstance'], accessed_by=DictGetItemGuardAccessor(isinstance)
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___1']['isinstance'], 139845257826512)
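Each entry in the tree above is a cheap runtime check (ID_MATCH on object identity, TYPE_MATCH on the exact class, EQUALS_MATCH on a value, LENGTH_CHECK on container size) that must pass for this frame's cached compiled code to be reused; any failure triggers recompilation. A minimal sketch for dumping the same kind of guard tree for your own function, assuming the standard logging-artifact API:

import torch

torch._logging.set_logs(guards=True)  # equivalent to running with TORCH_LOGS="guards"

@torch.compile
def f(x):
    return x + 1

f(torch.randn(4))  # the guard tree prints when this frame compiles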
V0627 17:31:01.498000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "0/0", "frame_key": "1", "co_name": "forward_pass", "co_filename": "/localdisk/leslie/torch_inductor_community/pytorch/benchmarks/dynamo/torchbench.py", "co_firstlineno": 425, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 19, "shape_env_guard_count": 0, "graph_op_count": 2, "graph_node_count": 3, "graph_input_count": 0, "start_time": 1719534660.6636841, "entire_frame_compile_time_s": 0.8347411155700684, "backend_compile_time_s": 0.7748816013336182, "inductor_compile_time_s": 0.00018596649169921875, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Logger not supported for non-export cases"], "dynamo_time_before_restart_s": 0.04396843910217285, "has_guarded_code": true}, "frame_id": 0, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:01.500000 139845268738432 torch/_logging/structured.py:19] {"str": ["/localdisk/leslie/torch_inductor_community/pytorch/torch/nn/modules/module.py", 4]}
V0627 17:31:01.500000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:01.513000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 6, "size": 6552}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:01.513000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 819], "is_leaf": true, "stride": [819, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2ef4724e50>", "describer_id": 6}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:01.514000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 6, "id": 0, "source": "L['input_ids']"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:01.519000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 6, "size": 32768}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:01.519000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 4096], "is_leaf": true, "stride": [4096, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2eb1d44450>", "describer_id": 6}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:01.519000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 6, "id": 1, "source": "L['self'].bert.embeddings.token_type_ids"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:01.530000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 7, "size": 6552}, "frame_id": 1, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:01.530000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 819], "is_leaf": true, "stride": [819, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2ef4724e50>", "describer_id": 7}, "frame_id": 1, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:01.530000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 7, "id": 0, "source": "L['input_ids']"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:01.535000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:01.535000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 1, "has_payload": "6017f86a7c776c49ca1dd7d3539605bb"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839714901824)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| | | +- GuardManager: source=L['self']._modules, accessed_by=DictGetItemGuardAccessor(_modules)
| | | | +- GuardManager: source=L['self'].bert, accessed_by=DictGetItemGuardAccessor(bert)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].bert, 139839714901584)
| | | | | +- GuardManager: source=L['self'].bert.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- GuardManager: source=L['self'].bert.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].bert.training, 7685824)
| | | +- GuardManager: source=L['self'].config, accessed_by=DictGetItemGuardAccessor(config)
| | | | +- TYPE_MATCH: ___check_type_id(L['self'].config, 139839810624528)
| +- GuardManager: source=L['head_mask'], accessed_by=DictGetItemGuardAccessor(head_mask)
| | +- ID_MATCH: ___check_obj_id(L['head_mask'], 7636800)
| +- GuardManager: source=L['input_ids'], accessed_by=DictGetItemGuardAccessor(input_ids)
| | +- TENSOR_MATCH: check_tensor(L['input_ids'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int64, device=None, requires_grad=False, size=[1, 819], stride=[819, 1])
| | +- NO_HASATTR: hasattr(L['input_ids'], '_dynamo_dynamic_indices') == False
| +- GuardManager: source=L['return_dict'], accessed_by=DictGetItemGuardAccessor(return_dict)
| | +- ID_MATCH: ___check_obj_id(L['return_dict'], 7636800)
| +- GuardManager: source=L['position_ids'], accessed_by=DictGetItemGuardAccessor(position_ids)
| | +- ID_MATCH: ___check_obj_id(L['position_ids'], 7636800)
| +- GuardManager: source=L['inputs_embeds'], accessed_by=DictGetItemGuardAccessor(inputs_embeds)
| | +- ID_MATCH: ___check_obj_id(L['inputs_embeds'], 7636800)
| +- GuardManager: source=L['attention_mask'], accessed_by=DictGetItemGuardAccessor(attention_mask)
| | +- ID_MATCH: ___check_obj_id(L['attention_mask'], 7636800)
| +- GuardManager: source=L['token_type_ids'], accessed_by=DictGetItemGuardAccessor(token_type_ids)
| | +- ID_MATCH: ___check_obj_id(L['token_type_ids'], 7636800)
| +- GuardManager: source=L['output_attentions'], accessed_by=DictGetItemGuardAccessor(output_attentions)
| | +- ID_MATCH: ___check_obj_id(L['output_attentions'], 7636800)
| +- GuardManager: source=L['output_hidden_states'], accessed_by=DictGetItemGuardAccessor(output_hidden_states)
| | +- ID_MATCH: ___check_obj_id(L['output_hidden_states'], 7636800)
| +- GuardManager: source=L['encoder_hidden_states'], accessed_by=DictGetItemGuardAccessor(encoder_hidden_states)
| | +- ID_MATCH: ___check_obj_id(L['encoder_hidden_states'], 7636800)
| +- GuardManager: source=L['encoder_attention_mask'], accessed_by=DictGetItemGuardAccessor(encoder_attention_mask)
| | +- ID_MATCH: ___check_obj_id(L['encoder_attention_mask'], 7636800)
V0627 17:31:01.535000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "1/0", "frame_key": "6", "co_name": "forward", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 2382, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 19, "shape_env_guard_count": 0, "graph_op_count": 0, "graph_node_count": 1, "graph_input_count": 1, "start_time": 1719534661.5002189, "entire_frame_compile_time_s": 0.03560638427734375, "backend_compile_time_s": null, "inductor_compile_time_s": null, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Logger not supported for non-export cases"], "dynamo_time_before_restart_s": 0.022696733474731445, "has_guarded_code": true}, "frame_id": 1, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:01.536000 139845268738432 torch/_logging/structured.py:19] {"str": ["/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", 5]}
V0627 17:31:01.536000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 2, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:01.542000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 8, "size": 6552}, "frame_id": 2, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:01.542000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 819], "is_leaf": true, "stride": [819, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2ef4724e50>", "describer_id": 8}, "frame_id": 2, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:01.542000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 8, "id": 0, "source": "L['input_ids']"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:01.548000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 8, "size": 32768}, "frame_id": 2, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:01.548000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 4096], "is_leaf": true, "stride": [4096, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2eb1d44450>", "describer_id": 8}, "frame_id": 2, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:01.548000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 8, "id": 1, "source": "L['self'].embeddings.token_type_ids"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:01.557000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 9, "size": 6552}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:01.557000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 819], "is_leaf": true, "stride": [819, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2ef4724e50>", "describer_id": 9}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:01.557000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 9, "id": 0, "source": "L['input_ids']"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:01.562000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 9, "size": 32768}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:01.563000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 4096], "is_leaf": true, "stride": [4096, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2eb1d44450>", "describer_id": 9}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:01.563000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 9, "id": 1, "source": "L['self'].embeddings.token_type_ids"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:01.566000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {"attention_mask": [1, 819], "l__self___embeddings_token_type_ids": [1, 4096], "buffered_token_type_ids": [1, 819], "buffered_token_type_ids_expanded": [1, 819]}}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "6b7bec0701d22225fb67e6f1bfb9dc36"}
class GraphModule(torch.nn.Module):
    def forward(self):
        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2039 in forward, code: attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
        attention_mask: "f32[1, 819][819, 1]cpu" = torch.ones((1, 819), device = device(type='cpu'))

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2042 in forward, code: buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
        l__self___embeddings_token_type_ids: "i64[1, 4096][4096, 1]cpu" = self.L__self___embeddings_token_type_ids
        buffered_token_type_ids: "i64[1, 819][4096, 1]cpu" = l__self___embeddings_token_type_ids[(slice(None, None, None), slice(None, 819, None))]; l__self___embeddings_token_type_ids = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2043 in forward, code: buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
        buffered_token_type_ids_expanded: "i64[1, 819][4096, 1]cpu" = buffered_token_type_ids.expand(1, 819); buffered_token_type_ids = None
        return (attention_mask, buffered_token_type_ids_expanded)
V0627 17:31:01.581000 139845268738432 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:199] {"aot_forward_graph": {}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "1b3fb2899c356f991117f2262727f0ef"}
class <lambda>(torch.nn.Module):
    def forward(self, arg0_1: "i64[1, 4096][4096, 1]cpu"):
        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2039 in forward, code: attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
        full: "f32[1, 819][819, 1]cpu" = torch.ops.aten.full.default([1, 819], 1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False)

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2042 in forward, code: buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
        slice_1: "i64[1, 4096][4096, 1]cpu" = torch.ops.aten.slice.Tensor(arg0_1, 0, 0, 9223372036854775807); arg0_1 = None
        slice_2: "i64[1, 819][4096, 1]cpu" = torch.ops.aten.slice.Tensor(slice_1, 1, 0, 819); slice_1 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2043 in forward, code: buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
        expand: "i64[1, 819][4096, 1]cpu" = torch.ops.aten.expand.default(slice_2, [1, 819]); slice_2 = None
        return (full, expand)
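In the ATen graph above, the end index 9223372036854775807 is 2**63 - 1, the sentinel ATen uses for an open-ended Python slice, so the two slice ops together are just token_type_ids[:, :819]. A quick equivalence check (illustrative, not taken from the log):

import torch

t = torch.zeros(1, 4096, dtype=torch.int64)
a = t[:, :819]
# dim 0 sliced with the INT64_MAX sentinel, then dim 1 clipped to 819
b = torch.ops.aten.slice.Tensor(
    torch.ops.aten.slice.Tensor(t, 0, 0, 9223372036854775807), 1, 0, 819
)
assert torch.equal(a, b) and b.stride() == (4096, 1)  # stride matches the [4096, 1] annotation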
V0627 17:31:01.707000 139845268738432 torch/_inductor/compile_fx.py:754] {"inductor_post_grad_graph": {}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "4a69dc4d0dfb43287c6abf210e06617e"}
class <lambda>(torch.nn.Module):
    def forward(self, arg0_1: "i64[1, 4096][4096, 1]cpu"):
        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2039 in forward, code: attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
        full_default: "f32[1, 819][819, 1]cpu" = torch.ops.aten.full.default([1, 819], 1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False)

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2042 in forward, code: buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
        slice_2: "i64[1, 819][4096, 1]cpu" = torch.ops.aten.slice.Tensor(arg0_1, 1, 0, 819); arg0_1 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2043 in forward, code: buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
        expand: "i64[1, 819][4096, 1]cpu" = torch.ops.aten.expand.default(slice_2, [1, 819]); slice_2 = None
        return (full_default, expand)
V0627 17:31:02.787000 139845268738432 torch/_inductor/graph.py:1693] {"inductor_output_code": {"filename": "/tmp/torchinductor_leslie/ko/ckovbmwilyxv4sl5d74oe6jsmj25rub5gzv5kco7u66lavi64ivy.py"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "0244b4281966e5f52ba168279eb45118"}
# AOT ID: ['1_inference']
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
from torch._inductor.codegen.memory_planning import _align as align
from torch import device, empty_strided
from torch._inductor.async_compile import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
from torch._inductor.codegen.multi_kernel import MultiKernelCall
aten = torch.ops.aten
inductor_ops = torch.ops.inductor
_quantized = torch.ops._quantized
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
alloc_from_pool = torch.ops.inductor._alloc_from_pool
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
async_compile = AsyncCompile()
cpp_fused_ones_0 = async_compile.cpp_pybinding(['float*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(float* out_ptr0)
{
    {
        for(long x0=static_cast<long>(0L); x0<static_cast<long>(816L); x0+=static_cast<long>(16L))
        {
            auto tmp0 = static_cast<float>(1.0);
            auto tmp1 = at::vec::Vectorized<float>(tmp0);
            tmp1.store(out_ptr0 + static_cast<long>(x0));
        }
        #pragma omp simd simdlen(8)
        for(long x0=static_cast<long>(816L); x0<static_cast<long>(819L); x0+=static_cast<long>(1L))
        {
            auto tmp0 = static_cast<float>(1.0);
            out_ptr0[static_cast<long>(x0)] = tmp0;
        }
    }
}
''')

async_compile.wait(globals())
del async_compile

def call(args):
    arg0_1, = args
    args.clear()
    assert_size_stride(arg0_1, (1, 4096), (4096, 1))
    buf0 = empty_strided_cpu((1, 819), (819, 1), torch.float32)
    cpp_fused_ones_0(buf0)
    return (buf0, reinterpret_tensor(arg0_1, (1, 819), (4096, 1), 0), )

def benchmark_compiled_module(times=10, repeat=10):
    from torch._dynamo.testing import rand_strided
    from torch._inductor.utils import print_performance
    arg0_1 = rand_strided((1, 4096), (4096, 1), device='cpu', dtype=torch.int64)
    fn = lambda: call([arg0_1])
    return print_performance(fn, times=times, repeat=repeat)

if __name__ == "__main__":
    from torch._inductor.wrapper_benchmark import compiled_module_main
    compiled_module_main('hf_BigBird', benchmark_compiled_module)
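Generated wrappers like the one above are written to the cache path shown in the inductor_output_code entry and are self-contained: call() is the compiled entry point, and running the file directly benchmarks it via compiled_module_main. A sketch of loading the cached module by hand (the path is this log's cache file, so this only works on the original machine):

import importlib.util

spec = importlib.util.spec_from_file_location(
    "inductor_wrapper",
    "/tmp/torchinductor_leslie/ko/ckovbmwilyxv4sl5d74oe6jsmj25rub5gzv5kco7u66lavi64ivy.py",
)
wrapper = importlib.util.module_from_spec(spec)
spec.loader.exec_module(wrapper)          # compiles and binds the C++ kernel
wrapper.benchmark_compiled_module()       # times the compiled kernel in isolation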
V0627 17:31:02.814000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:02.815000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "0b1c2f71c2e67149726041714c77db6e"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839714901584)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| | | +- GuardManager: source=L['self'].config, accessed_by=DictGetItemGuardAccessor(config)
| | | | +- TYPE_MATCH: ___check_type_id(L['self'].config, 139839810624528)
| | | +- GuardManager: source=L['self']._modules, accessed_by=DictGetItemGuardAccessor(_modules)
| | | | +- GuardManager: source=L['self'].embeddings, accessed_by=DictGetItemGuardAccessor(embeddings)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings, 139839713378208)
| | | | | +- GuardManager: source=L['self'].embeddings.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- GuardManager: source=L['self'].embeddings.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.training, 7685824)
| | | | | | +- GuardManager: source=L['self'].embeddings._buffers, accessed_by=DictGetItemGuardAccessor(_buffers)
| | | | | | | +- GuardManager: source=L['self'].embeddings.token_type_ids, accessed_by=DictGetItemGuardAccessor(token_type_ids)
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.token_type_ids, 139838528701520)
| | | +- GuardManager: source=L['self'].attention_type, accessed_by=DictGetItemGuardAccessor(attention_type)
| | | | +- EQUALS_MATCH: L['self'].attention_type == 'block_sparse'
| +- GuardManager: source=L['input_ids'], accessed_by=DictGetItemGuardAccessor(input_ids)
| | +- TENSOR_MATCH: check_tensor(L['input_ids'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int64, device=None, requires_grad=False, size=[1, 819], stride=[819, 1])
| | +- NO_HASATTR: hasattr(L['input_ids'], '_dynamo_dynamic_indices') == False
| +- GuardManager: source=L['return_dict'], accessed_by=DictGetItemGuardAccessor(return_dict)
| | +- ID_MATCH: ___check_obj_id(L['return_dict'], 7685856)
| +- GuardManager: source=L['position_ids'], accessed_by=DictGetItemGuardAccessor(position_ids)
| | +- ID_MATCH: ___check_obj_id(L['position_ids'], 7636800)
| +- GuardManager: source=L['inputs_embeds'], accessed_by=DictGetItemGuardAccessor(inputs_embeds)
| | +- ID_MATCH: ___check_obj_id(L['inputs_embeds'], 7636800)
| +- GuardManager: source=L['attention_mask'], accessed_by=DictGetItemGuardAccessor(attention_mask)
| | +- ID_MATCH: ___check_obj_id(L['attention_mask'], 7636800)
| +- GuardManager: source=L['token_type_ids'], accessed_by=DictGetItemGuardAccessor(token_type_ids)
| | +- ID_MATCH: ___check_obj_id(L['token_type_ids'], 7636800)
| +- GuardManager: source=L['past_key_values'], accessed_by=DictGetItemGuardAccessor(past_key_values)
| | +- ID_MATCH: ___check_obj_id(L['past_key_values'], 7636800)
| +- GuardManager: source=L['output_attentions'], accessed_by=DictGetItemGuardAccessor(output_attentions)
| | +- ID_MATCH: ___check_obj_id(L['output_attentions'], 7636800)
| +- GuardManager: source=L['output_hidden_states'], accessed_by=DictGetItemGuardAccessor(output_hidden_states)
| | +- ID_MATCH: ___check_obj_id(L['output_hidden_states'], 7636800)
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['torch'], accessed_by=DictGetItemGuardAccessor(torch)
| | | +- ID_MATCH: ___check_obj_id(G['torch'], 139845236322800)
| | | +- GuardManager: source=G['torch'].ones, accessed_by=GetAttrGuardAccessor(ones)
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].ones, 139845228734288)
| | +- GuardManager: source=G['__import_torch'], accessed_by=DictGetItemGuardAccessor(__import_torch)
| | | +- GuardManager: source=G['__import_torch'].fx, accessed_by=GetAttrGuardAccessor(fx)
| | | | +- ID_MATCH: ___check_obj_id(G['__import_torch'].fx, 139842407409488)
| | | | +- GuardManager: source=G['__import_torch'].fx.Proxy, accessed_by=GetAttrGuardAccessor(Proxy)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch'].fx.Proxy, 139842429035536)
| | | +- GuardManager: source=G['__import_torch']._dynamo, accessed_by=GetAttrGuardAccessor(_dynamo)
| | | | +- ID_MATCH: ___check_obj_id(G['__import_torch']._dynamo, 139839776121264)
| | | | +- GuardManager: source=G['__import_torch']._dynamo.is_compiling, accessed_by=GetAttrGuardAccessor(is_compiling)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch']._dynamo.is_compiling, 139839726529856)
| | +- GuardManager: source=G['__builtins_dict___9'], accessed_by=DictGetItemGuardAccessor(__builtins_dict___9)
| | | +- GuardManager: source=G['__builtins_dict___9']['hasattr'], accessed_by=DictGetItemGuardAccessor(hasattr)
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___9']['hasattr'], 139845257826112)
| | | +- GuardManager: source=G['__builtins_dict___9']['isinstance'], accessed_by=DictGetItemGuardAccessor(isinstance)
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___9']['isinstance'], 139845257826512)
| | +- GuardManager: source=G['__import_transformers_dot_modeling_utils'], accessed_by=DictGetItemGuardAccessor(__import_transformers_dot_modeling_utils)
| | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_modeling_utils'], 139839661201088)
| | | +- GuardManager: source=G['__import_transformers_dot_modeling_utils'].torch, accessed_by=GetAttrGuardAccessor(torch)
| | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_modeling_utils'].torch, 139845236322800)
| | | | +- GuardManager: source=G['__import_transformers_dot_modeling_utils'].torch.jit, accessed_by=GetAttrGuardAccessor(jit)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_modeling_utils'].torch.jit, 139842414949968)
| | | | | +- GuardManager: source=G['__import_transformers_dot_modeling_utils'].torch.jit.is_tracing, accessed_by=GetAttrGuardAccessor(is_tracing)
| | | | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_modeling_utils'].torch.jit.is_tracing, 139842413687088)
| | | +- GuardManager: source=G['__import_transformers_dot_modeling_utils'].is_torch_fx_proxy, accessed_by=GetAttrGuardAccessor(is_torch_fx_proxy)
| | | | +- GuardManager: source=G['__import_transformers_dot_modeling_utils'].is_torch_fx_proxy.__code__, accessed_by=GetAttrGuardAccessor(__code__)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_modeling_utils'].is_torch_fx_proxy.__code__, 139839683265264)
| | | +- GuardManager: source=G['__import_transformers_dot_modeling_utils'].is_torchdynamo_compiling, accessed_by=GetAttrGuardAccessor(is_torchdynamo_compiling)
| | | | +- GuardManager: source=G['__import_transformers_dot_modeling_utils'].is_torchdynamo_compiling.__code__, accessed_by=GetAttrGuardAccessor(__code__)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_modeling_utils'].is_torchdynamo_compiling.__code__, 139839683236192)
| | +- GuardManager: source=G['__import_transformers_dot_utils_dot_import_utils'], accessed_by=DictGetItemGuardAccessor(__import_transformers_dot_utils_dot_import_utils)
| | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_utils_dot_import_utils'], 139839683217824)
| | | +- GuardManager: source=G['__import_transformers_dot_utils_dot_import_utils']._torch_available, accessed_by=GetAttrGuardAccessor(_torch_available)
| | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_utils_dot_import_utils']._torch_available, 7685856)
| | | +- GuardManager: source=G['__import_transformers_dot_utils_dot_import_utils'].is_torch_available, accessed_by=GetAttrGuardAccessor(is_torch_available)
| | | | +- GuardManager: source=G['__import_transformers_dot_utils_dot_import_utils'].is_torch_available.__code__, accessed_by=GetAttrGuardAccessor(__code__)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_utils_dot_import_utils'].is_torch_available.__code__, 139839683197424)
| | | +- GuardManager: source=G['__import_transformers_dot_utils_dot_import_utils']._torch_fx_available, accessed_by=GetAttrGuardAccessor(_torch_fx_available)
| | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_utils_dot_import_utils']._torch_fx_available, 7685856)
| | | +- GuardManager: source=G['__import_transformers_dot_utils_dot_import_utils'].is_torch_fx_available, accessed_by=GetAttrGuardAccessor(is_torch_fx_available)
| | | | +- GuardManager: source=G['__import_transformers_dot_utils_dot_import_utils'].is_torch_fx_available.__code__, accessed_by=GetAttrGuardAccessor(__code__)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_utils_dot_import_utils'].is_torch_fx_available.__code__, 139839683233376)
V0627 17:31:02.815000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "2/0", "frame_key": "7", "co_name": "forward", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 1970, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 39, "shape_env_guard_count": 0, "graph_op_count": 3, "graph_node_count": 5, "graph_input_count": 0, "start_time": 1719534661.536575, "entire_frame_compile_time_s": 1.2790420055389404, "backend_compile_time_s": 1.2300312519073486, "inductor_compile_time_s": 1.2066993713378906, "code_gen_time_s": 1.083174467086792, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Logger not supported for non-export cases"], "dynamo_time_before_restart_s": 0.014907121658325195, "has_guarded_code": true}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:02.816000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 3, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.823000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 12, "size": 6552}, "frame_id": 3, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.823000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 819], "is_leaf": true, "stride": [819, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2ef4724e50>", "describer_id": 12}, "frame_id": 3, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.823000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 12, "id": 0, "source": "L['input_ids']"}, "frame_id": 3, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.830000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 13, "size": 6552}, "frame_id": 3, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:02.830000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 819], "is_leaf": true, "stride": [819, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2ef4724e50>", "describer_id": 13}, "frame_id": 3, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:02.830000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 13, "id": 0, "source": "L['input_ids']"}, "frame_id": 3, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:02.837000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 3, "frame_compile_id": 0, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:02.837000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 3, "frame_compile_id": 0, "attempt": 1, "has_payload": "fce4fac5f9230c475246dd6dd52e1c05"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839714901584)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| | | +- GuardManager: source=L['self'].config, accessed_by=DictGetItemGuardAccessor(config)
| | | | +- TYPE_MATCH: ___check_type_id(L['self'].config, 139839810624528)
| +- GuardManager: source=L['input_ids'], accessed_by=DictGetItemGuardAccessor(input_ids)
| | +- TENSOR_MATCH: check_tensor(L['input_ids'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int64, device=None, requires_grad=False, size=[1, 819], stride=[819, 1])
| | +- NO_HASATTR: hasattr(L['input_ids'], '_dynamo_dynamic_indices') == False
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['logger'], accessed_by=DictGetItemGuardAccessor(logger)
| | | +- ID_MATCH: ___check_obj_id(G['logger'], 139839664782448)
V0627 17:31:02.837000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "3/0", "frame_key": "8", "co_name": "_pad_to_block_size", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 2208, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 9, "shape_env_guard_count": 0, "graph_op_count": 0, "graph_node_count": 1, "graph_input_count": 1, "start_time": 1719534662.816827, "entire_frame_compile_time_s": 0.0205228328704834, "backend_compile_time_s": null, "inductor_compile_time_s": null, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Logger not supported for non-export cases"], "dynamo_time_before_restart_s": 0.007956266403198242, "has_guarded_code": true}, "frame_id": 3, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:02.838000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2226, "name": "_pad_to_block_size", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.839000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 14, "size": 6552}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.839000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 819], "is_leaf": true, "stride": [819, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2ef4724e50>", "describer_id": 14}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.839000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 14, "id": 0, "source": "L['input_ids']"}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.843000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 14, "size": 3276}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.844000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 2, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 819], "is_leaf": true, "stride": [819, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31f99710>", "describer_id": 14}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.844000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 14, "id": 2, "source": "L['attention_mask']"}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.846000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 14, "size": 32768}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.846000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 5, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 4096], "is_leaf": true, "stride": [4096, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2eb1d44450>", "describer_id": 14}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.846000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 4, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 819], "is_leaf": true, "is_view": true, "stride": [4096, 1], "storage": 2, "base": 5, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2ed9f15fd0>", "describer_id": 14}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.846000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 14, "id": 4, "source": "L['token_type_ids']"}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.850000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {"l_input_ids_": [1, 819], "l_attention_mask_": [1, 819], "l_token_type_ids_": [1, 819], "input_ids": [1, 832], "attention_mask": [1, 832], "token_type_ids": [1, 832]}}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "e7d86ff372082e962b35557ebd7308fc"}
class GraphModule(torch.nn.Module):
    def forward(self, L_input_ids_: "i64[1, 819][819, 1]cpu", L_attention_mask_: "f32[1, 819][819, 1]cpu", L_token_type_ids_: "i64[1, 819][4096, 1]cpu"):
        l_input_ids_ = L_input_ids_
        l_attention_mask_ = L_attention_mask_
        l_token_type_ids_ = L_token_type_ids_

        # File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/functional.py:4552 in pad, code: return torch._C._nn.pad(input, pad, mode, value)
        input_ids: "i64[1, 832][832, 1]cpu" = torch._C._nn.pad(l_input_ids_, (0, 13), 'constant', 0); l_input_ids_ = None

        # File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/functional.py:4552 in pad, code: return torch._C._nn.pad(input, pad, mode, value)
        attention_mask: "f32[1, 832][832, 1]cpu" = torch._C._nn.pad(l_attention_mask_, (0, 13), 'constant', False); l_attention_mask_ = None

        # File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/functional.py:4552 in pad, code: return torch._C._nn.pad(input, pad, mode, value)
        token_type_ids: "i64[1, 832][832, 1]cpu" = torch._C._nn.pad(l_token_type_ids_, (0, 13), 'constant', 0); l_token_type_ids_ = None
        return (input_ids, attention_mask, token_type_ids)
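The pad of 13 elements in each op above comes from BigBird's block-sparse attention (note the EQUALS_MATCH guard on L['padding_len'] == 13 further down), which needs the sequence length to be a multiple of the block size; 819 rounds up to 832. A worked check, assuming the Hugging Face BigBird default block_size of 64:

seq_len, block_size = 819, 64  # block_size is an assumption from the BigBird config default
padding_len = (block_size - seq_len % block_size) % block_size
assert padding_len == 13 and seq_len + padding_len == 832  # 832 = 13 * 64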
V0627 17:31:02.865000 139845268738432 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:199] {"aot_forward_graph": {}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "c1bca16543cf877223d6615932016518"}
class <lambda>(torch.nn.Module):
    def forward(self, arg0_1: "i64[1, 819][819, 1]cpu", arg1_1: "f32[1, 819][819, 1]cpu", arg2_1: "i64[1, 819][4096, 1]cpu"):
        # File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/functional.py:4552 in pad, code: return torch._C._nn.pad(input, pad, mode, value)
        constant_pad_nd: "i64[1, 832][832, 1]cpu" = torch.ops.aten.constant_pad_nd.default(arg0_1, [0, 13], 0.0); arg0_1 = None

        # File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/functional.py:4552 in pad, code: return torch._C._nn.pad(input, pad, mode, value)
        constant_pad_nd_1: "f32[1, 832][832, 1]cpu" = torch.ops.aten.constant_pad_nd.default(arg1_1, [0, 13], 0.0); arg1_1 = None

        # File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/functional.py:4552 in pad, code: return torch._C._nn.pad(input, pad, mode, value)
        constant_pad_nd_2: "i64[1, 832][832, 1]cpu" = torch.ops.aten.constant_pad_nd.default(arg2_1, [0, 13], 0.0); arg2_1 = None
        return (constant_pad_nd, constant_pad_nd_1, constant_pad_nd_2)
V0627 17:31:02.875000 139845268738432 torch/_inductor/compile_fx.py:754] {"inductor_post_grad_graph": {}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "c1bca16543cf877223d6615932016518"}
class <lambda>(torch.nn.Module):
    def forward(self, arg0_1: "i64[1, 819][819, 1]cpu", arg1_1: "f32[1, 819][819, 1]cpu", arg2_1: "i64[1, 819][4096, 1]cpu"):
        # File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/functional.py:4552 in pad, code: return torch._C._nn.pad(input, pad, mode, value)
        constant_pad_nd: "i64[1, 832][832, 1]cpu" = torch.ops.aten.constant_pad_nd.default(arg0_1, [0, 13], 0.0); arg0_1 = None

        # File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/functional.py:4552 in pad, code: return torch._C._nn.pad(input, pad, mode, value)
        constant_pad_nd_1: "f32[1, 832][832, 1]cpu" = torch.ops.aten.constant_pad_nd.default(arg1_1, [0, 13], 0.0); arg1_1 = None

        # File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/functional.py:4552 in pad, code: return torch._C._nn.pad(input, pad, mode, value)
        constant_pad_nd_2: "i64[1, 832][832, 1]cpu" = torch.ops.aten.constant_pad_nd.default(arg2_1, [0, 13], 0.0); arg2_1 = None
        return (constant_pad_nd, constant_pad_nd_1, constant_pad_nd_2)
V0627 17:31:02.904000 139845268738432 torch/_inductor/graph.py:1693] {"inductor_output_code": {"filename": "/tmp/torchinductor_leslie/fj/cfjbx53t7urpjz2ng6piwvoj4vsjiqa2xrgjcrh4vovq4w45eywv.py"}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "3758e3875a0e606fcec57aeffa852874"}
# AOT ID: ['2_inference']
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
from torch._inductor.codegen.memory_planning import _align as align
from torch import device, empty_strided
from torch._inductor.async_compile import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
from torch._inductor.codegen.multi_kernel import MultiKernelCall
aten = torch.ops.aten
inductor_ops = torch.ops.inductor
_quantized = torch.ops._quantized
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
alloc_from_pool = torch.ops.inductor._alloc_from_pool
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
async_compile = AsyncCompile()
cpp_fused_constant_pad_nd_0 = async_compile.cpp_pybinding(['const int64_t*', 'const float*', 'const int64_t*', 'int64_t*', 'float*', 'int64_t*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const int64_t* in_ptr0,
                       const float* in_ptr1,
                       const int64_t* in_ptr2,
                       int64_t* out_ptr0,
                       float* out_ptr1,
                       int64_t* out_ptr2)
{
    {
        for(long x0=static_cast<long>(0L); x0<static_cast<long>(832L); x0+=static_cast<long>(16L))
        {
            auto tmp0 = x0;
            auto tmp1 = c10::convert<int32_t>(tmp0);
            auto tmp2 = at::vec::Vectorized<int32_t>::arange(tmp1, 1);
            auto tmp3 = static_cast<int32_t>(819);
            auto tmp4 = at::vec::Vectorized<int32_t>(tmp3);
            auto tmp5 = at::vec::VecMask<int32_t,1>(tmp2 < tmp4);
            auto tmp6 = [&]
            {
                auto tmp7 = tmp5.template cast<float,1>().template loadu<int64_t,2>(in_ptr0 + static_cast<long>(x0));
                return tmp7;
            }
            ;
            auto tmp10 =
            [&]
            {
                if (tmp5.all_zero())
                {
                    return at::vec::VectorizedN<int64_t,2>(static_cast<int64_t>(0));
                }
                else
                {
                    auto tmp8 = tmp6();
                    auto tmp9 = at::vec::VectorizedN<int64_t,2>(static_cast<int64_t>(0));
                    return decltype(tmp8)::blendv(tmp9, tmp8, tmp5.template cast<int64_t,2>());
                }
            }
            ()
            ;
            tmp10.store(out_ptr0 + static_cast<long>(x0), 16);
        }
    }
    {
        for(long x0=static_cast<long>(0L); x0<static_cast<long>(832L); x0+=static_cast<long>(16L))
        {
            auto tmp0 = x0;
            auto tmp1 = c10::convert<int32_t>(tmp0);
            auto tmp2 = at::vec::Vectorized<int32_t>::arange(tmp1, 1);
            auto tmp3 = static_cast<int32_t>(819);
            auto tmp4 = at::vec::Vectorized<int32_t>(tmp3);
            auto tmp5 = at::vec::VecMask<int32_t,1>(tmp2 < tmp4);
            auto tmp6 = [&]
            {
                auto tmp7 = tmp5.template cast<float,1>().template loadu<float,1>(in_ptr1 + static_cast<long>(x0));
                return tmp7;
            }
            ;
            auto tmp10 =
            [&]
            {
                if (tmp5.all_zero())
                {
                    return at::vec::Vectorized<float>(static_cast<float>(0.0));
                }
                else
                {
                    auto tmp8 = tmp6();
                    auto tmp9 = at::vec::Vectorized<float>(static_cast<float>(0.0));
                    return decltype(tmp8)::blendv(tmp9, tmp8, tmp5.template cast<float,1>());
                }
            }
            ()
            ;
            tmp10.store(out_ptr1 + static_cast<long>(x0));
        }
    }
    {
        for(long x0=static_cast<long>(0L); x0<static_cast<long>(832L); x0+=static_cast<long>(16L))
        {
            auto tmp0 = x0;
            auto tmp1 = c10::convert<int32_t>(tmp0);
            auto tmp2 = at::vec::Vectorized<int32_t>::arange(tmp1, 1);
            auto tmp3 = static_cast<int32_t>(819);
            auto tmp4 = at::vec::Vectorized<int32_t>(tmp3);
            auto tmp5 = at::vec::VecMask<int32_t,1>(tmp2 < tmp4);
            auto tmp6 = [&]
            {
                auto tmp7 = tmp5.template cast<float,1>().template loadu<int64_t,2>(in_ptr2 + static_cast<long>(x0));
                return tmp7;
            }
            ;
            auto tmp10 =
            [&]
            {
                if (tmp5.all_zero())
                {
                    return at::vec::VectorizedN<int64_t,2>(static_cast<int64_t>(0));
                }
                else
                {
                    auto tmp8 = tmp6();
                    auto tmp9 = at::vec::VectorizedN<int64_t,2>(static_cast<int64_t>(0));
                    return decltype(tmp8)::blendv(tmp9, tmp8, tmp5.template cast<int64_t,2>());
                }
            }
            ()
            ;
            tmp10.store(out_ptr2 + static_cast<long>(x0), 16);
        }
    }
}
''')
async_compile.wait(globals())
del async_compile
def call(args):
arg0_1, arg1_1, arg2_1 = args
args.clear()
assert_size_stride(arg0_1, (1, 819), (819, 1))
assert_size_stride(arg1_1, (1, 819), (819, 1))
assert_size_stride(arg2_1, (1, 819), (4096, 1))
buf0 = empty_strided_cpu((1, 832), (832, 1), torch.int64)
buf1 = empty_strided_cpu((1, 832), (832, 1), torch.float32)
buf2 = empty_strided_cpu((1, 832), (832, 1), torch.int64)
cpp_fused_constant_pad_nd_0(arg0_1, arg1_1, arg2_1, buf0, buf1, buf2)
del arg0_1
del arg1_1
del arg2_1
return (buf0, buf1, buf2, )
def benchmark_compiled_module(times=10, repeat=10):
from torch._dynamo.testing import rand_strided
from torch._inductor.utils import print_performance
arg0_1 = rand_strided((1, 819), (819, 1), device='cpu', dtype=torch.int64)
arg1_1 = rand_strided((1, 819), (819, 1), device='cpu', dtype=torch.float32)
arg2_1 = rand_strided((1, 819), (4096, 1), device='cpu', dtype=torch.int64)
fn = lambda: call([arg0_1, arg1_1, arg2_1])
return print_performance(fn, times=times, repeat=repeat)
if __name__ == "__main__":
from torch._inductor.wrapper_benchmark import compiled_module_main
compiled_module_main('hf_BigBird', benchmark_compiled_module)
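Note: for readers cross-checking the kernel above, here is a minimal eager-mode sketch of what cpp_fused_constant_pad_nd_0 computes, under the guards recorded for this frame (padding_len == 13, pad_token_id == 0). Each (1, 819) input is zero-padded on the right to (1, 832), the next multiple of the 64-token block size; the vectorized mask tmp2 < 819 in the kernel enforces the same boundary. This is an illustration, not the generated code.

import torch
import torch.nn.functional as F

def pad_to_block_size_sketch(input_ids, attention_mask, token_type_ids,
                             padding_len=13, pad_token_id=0):
    # constant_pad_nd on the last dim: (1, 819) -> (1, 832); 832 = 13 * 64
    input_ids = F.pad(input_ids, (0, padding_len), value=pad_token_id)
    attention_mask = F.pad(attention_mask, (0, padding_len), value=0.0)  # padded positions masked out
    token_type_ids = F.pad(token_type_ids, (0, padding_len), value=0)
    return input_ids, attention_mask, token_type_ids

Since pad_token_id is guarded to 0 here, all three pads write zeros, which is why the kernel can use a single masked-load-or-zero pattern for every output.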
V0627 17:31:02.910000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:02.911000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "ae71e2a61c1f7e9b1434b71d14d096e3"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['input_ids'], accessed_by=DictGetItemGuardAccessor(input_ids)
| | +- TENSOR_MATCH: check_tensor(L['input_ids'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int64, device=None, requires_grad=False, size=[1, 819], stride=[819, 1])
| | +- NO_HASATTR: hasattr(L['input_ids'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['input_ids'], L['attention_mask'], L['token_type_ids'])
| +- GuardManager: source=L['padding_len'], accessed_by=DictGetItemGuardAccessor(padding_len)
| | +- EQUALS_MATCH: L['padding_len'] == 13
| +- GuardManager: source=L['pad_token_id'], accessed_by=DictGetItemGuardAccessor(pad_token_id)
| | +- EQUALS_MATCH: L['pad_token_id'] == 0
| +- GuardManager: source=L['position_ids'], accessed_by=DictGetItemGuardAccessor(position_ids)
| | +- ID_MATCH: ___check_obj_id(L['position_ids'], 7636800)
| +- GuardManager: source=L['inputs_embeds'], accessed_by=DictGetItemGuardAccessor(inputs_embeds)
| | +- ID_MATCH: ___check_obj_id(L['inputs_embeds'], 7636800)
| +- GuardManager: source=L['attention_mask'], accessed_by=DictGetItemGuardAccessor(attention_mask)
| | +- TENSOR_MATCH: check_tensor(L['attention_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 819], stride=[819, 1])
| | +- NO_HASATTR: hasattr(L['attention_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['input_ids'], L['attention_mask'], L['token_type_ids'])
| +- GuardManager: source=L['token_type_ids'], accessed_by=DictGetItemGuardAccessor(token_type_ids)
| | +- TENSOR_MATCH: check_tensor(L['token_type_ids'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int64, device=None, requires_grad=False, size=[1, 819], stride=[4096, 1])
| | +- NO_HASATTR: hasattr(L['token_type_ids'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['input_ids'], L['attention_mask'], L['token_type_ids'])
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['nn'], accessed_by=DictGetItemGuardAccessor(nn)
| | | +- ID_MATCH: ___check_obj_id(G['nn'], 139842442593680)
| | | +- GuardManager: source=G['nn'].functional, accessed_by=GetAttrGuardAccessor(functional)
| | | | +- ID_MATCH: ___check_obj_id(G['nn'].functional, 139842441627024)
| | | | +- GuardManager: source=G['nn'].functional.pad, accessed_by=GetAttrGuardAccessor(pad)
| | | | | +- GuardManager: source=G['nn'].functional.pad.__code__, accessed_by=GetAttrGuardAccessor(__code__)
| | | | | | +- ID_MATCH: ___check_obj_id(G['nn'].functional.pad.__code__, 139842439629440)
| | | | | +- GuardManager: source=G['nn'].functional.pad, accessed_by=FuncDefaultsGuardAccessor
| | | | | | +- GuardManager: source=G['nn'].functional.pad.__defaults__[0], accessed_by=GetItemGuardAccessor(0)
| | | | | | | +- EQUALS_MATCH: G['nn'].functional.pad.__defaults__[0] == 'constant'
| | +- GuardManager: source=G['__import_torch_dot_nn_dot_functional'], accessed_by=DictGetItemGuardAccessor(__import_torch_dot_nn_dot_functional)
| | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_functional'], 139842441627024)
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_functional'].torch, accessed_by=GetAttrGuardAccessor(torch)
| | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_functional'].torch, 139845236322800)
| | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_functional'].torch._C, accessed_by=GetAttrGuardAccessor(_C)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_functional'].torch._C, 139845228547104)
| | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_functional'].torch._C._nn, accessed_by=GetAttrGuardAccessor(_nn)
| | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_functional'].torch._C._nn, 139842445377216)
| | | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_functional'].torch._C._nn.pad, accessed_by=GetAttrGuardAccessor(pad)
| | | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_functional'].torch._C._nn.pad, 139842445416928)
| | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_functional'].torch.jit, accessed_by=GetAttrGuardAccessor(jit)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_functional'].torch.jit, 139842414949968)
| | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_functional'].torch.jit.is_scripting, accessed_by=GetAttrGuardAccessor(is_scripting)
| | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_functional'].torch.jit.is_scripting, 139842422983696)
| | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_functional'].torch.are_deterministic_algorithms_enabled, accessed_by=GetAttrGuardAccessor(are_deterministic_algorithms_enabled)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_functional'].torch.are_deterministic_algorithms_enabled, 139842451619504)
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_functional'].has_torch_function_unary, accessed_by=GetAttrGuardAccessor(has_torch_function_unary)
| | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_functional'].has_torch_function_unary, 139845228559104)
V0627 17:31:02.911000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "4/0", "frame_key": "9", "co_name": "torch_dynamo_resume_in__pad_to_block_size_at_2226", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 2226, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 26, "shape_env_guard_count": 0, "graph_op_count": 3, "graph_node_count": 7, "graph_input_count": 3, "start_time": 1719534662.838091, "entire_frame_compile_time_s": 0.07323813438415527, "backend_compile_time_s": 0.05719876289367676, "inductor_compile_time_s": 0.03380870819091797, "code_gen_time_s": 0.027545690536499023, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": [], "dynamo_time_before_restart_s": 0.0, "has_guarded_code": true}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.912000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.914000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 16, "size": 6656}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.914000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31b97f10>", "describer_id": 16}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.914000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 16, "id": 0, "source": "L['___stack0'][1]"}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.915000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 16, "size": 3328}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.915000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 16}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.915000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 16, "id": 1, "source": "L['___stack0'][2]"}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.915000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 16, "size": 6656}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.916000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 2, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2ea81b39c0>", "describer_id": 16}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.916000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 16, "id": 2, "source": "L['___stack0'][3]"}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.936000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 3, "describer_id": 16, "size": 32768}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.936000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 16, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 4096], "is_leaf": true, "stride": [4096, 1], "storage": 3, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2eb1d44270>", "describer_id": 16}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.936000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 16, "id": 16, "source": "L['self'].embeddings.position_ids"}, "frame_id": 5, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:02.997000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 17, "size": 6656}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:02.998000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31b97f10>", "describer_id": 17}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:02.998000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 17, "id": 0, "source": "L['___stack0'][1]"}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:02.998000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 17, "size": 3328}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:02.998000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 17}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:02.998000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 17, "id": 1, "source": "L['___stack0'][2]"}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:02.999000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 17, "size": 6656}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:02.999000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 2, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2ea81b39c0>", "describer_id": 17}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:02.999000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 17, "id": 2, "source": "L['___stack0'][3]"}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.014000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 3, "describer_id": 17, "size": 32768}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.014000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.int64", "device": "device(type='cpu')", "size": [1, 4096], "is_leaf": true, "stride": [4096, 1], "storage": 3, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2eb1d44270>", "describer_id": 17}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.014000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 17, "id": 3, "source": "L['self'].embeddings.position_ids"}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.027000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {"l_stack0_1_": [1, 832], "l_stack0_2_": [1, 832], "l_stack0_3_": [1, 832], "blocked_encoder_mask": [1, 13, 64], "getitem": [1, 9, 64], "getitem_1": [1, 9, 64], "getitem_2": [1, 9, 64], "exp_blocked_to_pad": [1, 9, 192], "getitem_3": [1, 9, 64], "band_mask": [1, 1, 9, 64, 192], "unsqueeze_": [1, 1, 9, 64, 192], "from_mask": [1, 1, 832, 1], "to_mask": [1, 1, 1, 832], "l__self___embeddings_position_ids": [1, 4096], "position_ids": [1, 832], "inputs_embeds": [1, 832, 768], "token_type_embeddings": [1, 832, 768], "embeddings": [1, 832, 768], "position_embeddings": [1, 832, 768], "embeddings_1": [1, 832, 768], "embeddings_2": [1, 832, 768], "embeddings_3": [1, 832, 768]}}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1, "has_payload": "5bf8fff16cea4127a0a6b6a6800ef31a"}
class GraphModule(torch.nn.Module):
def forward(self, L_stack0_1_: "i64[1, 832][832, 1]cpu", L_stack0_2_: "f32[1, 832][832, 1]cpu", L_stack0_3_: "i64[1, 832][832, 1]cpu"):
l_stack0_1_ = L_stack0_1_
l_stack0_2_ = L_stack0_2_
l_stack0_3_ = L_stack0_3_
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2200 in create_masks_for_block_sparse_attn, code: blocked_encoder_mask = attention_mask.view(batch_size, seq_length // block_size, block_size)
blocked_encoder_mask: "f32[1, 13, 64][832, 64, 1]cpu" = l_stack0_2_.view(1, 13, 64)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2194 in create_band_mask_from_inputs, code: [to_blocked_mask[:, 1:-3], to_blocked_mask[:, 2:-2], to_blocked_mask[:, 3:-1]], dim=2
getitem: "f32[1, 9, 64][832, 64, 1]cpu" = blocked_encoder_mask[(slice(None, None, None), slice(1, -3, None))]
getitem_1: "f32[1, 9, 64][832, 64, 1]cpu" = blocked_encoder_mask[(slice(None, None, None), slice(2, -2, None))]
getitem_2: "f32[1, 9, 64][832, 64, 1]cpu" = blocked_encoder_mask[(slice(None, None, None), slice(3, -1, None))]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2193 in create_band_mask_from_inputs, code: exp_blocked_to_pad = torch.cat(
exp_blocked_to_pad: "f32[1, 9, 192][1728, 192, 1]cpu" = torch.cat([getitem, getitem_1, getitem_2], dim = 2); getitem = getitem_1 = getitem_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2196 in create_band_mask_from_inputs, code: band_mask = torch.einsum("blq,blk->blqk", from_blocked_mask[:, 2:-2], exp_blocked_to_pad)
getitem_3: "f32[1, 9, 64][832, 64, 1]cpu" = blocked_encoder_mask[(slice(None, None, None), slice(2, -2, None))]
band_mask: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = torch.functional.einsum('blq,blk->blqk', getitem_3, exp_blocked_to_pad); getitem_3 = exp_blocked_to_pad = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2197 in create_band_mask_from_inputs, code: band_mask.unsqueeze_(1)
unsqueeze_: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = band_mask.unsqueeze_(1)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2203 in create_masks_for_block_sparse_attn, code: from_mask = attention_mask.view(batch_size, 1, seq_length, 1)
from_mask: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu" = l_stack0_2_.view(1, 1, 832, 1)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2204 in create_masks_for_block_sparse_attn, code: to_mask = attention_mask.view(batch_size, 1, 1, seq_length)
to_mask: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = l_stack0_2_.view(1, 1, 1, 832); l_stack0_2_ = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:282 in forward, code: position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
l__self___embeddings_position_ids: "i64[1, 4096][4096, 1]cpu" = self.L__self___embeddings_position_ids
position_ids: "i64[1, 832][4096, 1]cpu" = l__self___embeddings_position_ids[(slice(None, None, None), slice(0, 832, None))]; l__self___embeddings_position_ids = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:296 in forward, code: inputs_embeds = self.word_embeddings(input_ids)
inputs_embeds: "f32[1, 832, 768][638976, 768, 1]cpu" = self.L__self___embeddings_word_embeddings(l_stack0_1_); l_stack0_1_ = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:301 in forward, code: token_type_embeddings = self.token_type_embeddings(token_type_ids)
token_type_embeddings: "f32[1, 832, 768][638976, 768, 1]cpu" = self.L__self___embeddings_token_type_embeddings(l_stack0_3_); l_stack0_3_ = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:303 in forward, code: embeddings = inputs_embeds + token_type_embeddings
embeddings: "f32[1, 832, 768][638976, 768, 1]cpu" = inputs_embeds + token_type_embeddings; inputs_embeds = token_type_embeddings = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:305 in forward, code: position_embeddings = self.position_embeddings(position_ids)
position_embeddings: "f32[1, 832, 768][638976, 768, 1]cpu" = self.L__self___embeddings_position_embeddings(position_ids); position_ids = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:306 in forward, code: embeddings += position_embeddings
embeddings += position_embeddings; embeddings_1: "f32[1, 832, 768][638976, 768, 1]cpu" = embeddings; embeddings = position_embeddings = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:308 in forward, code: embeddings = self.dropout(embeddings)
embeddings_2: "f32[1, 832, 768][638976, 768, 1]cpu" = self.L__self___embeddings_dropout(embeddings_1); embeddings_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:309 in forward, code: embeddings = self.LayerNorm(embeddings)
embeddings_3: "f32[1, 832, 768][638976, 768, 1]cpu" = self.L__self___embeddings_LayerNorm(embeddings_2); embeddings_2 = None
return (embeddings_3, band_mask, from_mask, to_mask, blocked_encoder_mask)
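Note: the graph above is BigBird's block-sparse mask construction. A standalone sketch with the recorded shapes (illustrative values, not the traced tensors):

import torch

attention_mask = torch.ones(1, 832)                      # stand-in for l_stack0_2_
blocked = attention_mask.view(1, 13, 64)                 # blocked_encoder_mask
exp_blocked_to_pad = torch.cat(
    [blocked[:, 1:-3], blocked[:, 2:-2], blocked[:, 3:-1]], dim=2)  # (1, 9, 192)
band_mask = torch.einsum("blq,blk->blqk", blocked[:, 2:-2], exp_blocked_to_pad)
band_mask.unsqueeze_(1)                                  # (1, 1, 9, 64, 192), matches unsqueeze_
from_mask = attention_mask.view(1, 1, 832, 1)
to_mask = attention_mask.view(1, 1, 1, 832)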
V0627 17:31:03.091000 139845268738432 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:199] {"aot_forward_graph": {}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1, "has_payload": "f7b6ff7875cdbc7ff1ea7b5f6bc39ed2"}
class <lambda>(torch.nn.Module):
def forward(self, arg0_1: "f32[50358, 768][768, 1]cpu", arg1_1: "f32[2, 768][768, 1]cpu", arg2_1: "f32[4096, 768][768, 1]cpu", arg3_1: "f32[768][1]cpu", arg4_1: "f32[768][1]cpu", arg5_1: "i64[1, 4096][4096, 1]cpu", arg6_1: "i64[1, 832][832, 1]cpu", arg7_1: "f32[1, 832][832, 1]cpu", arg8_1: "i64[1, 832][832, 1]cpu"):
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2200 in create_masks_for_block_sparse_attn, code: blocked_encoder_mask = attention_mask.view(batch_size, seq_length // block_size, block_size)
view: "f32[1, 13, 64][832, 64, 1]cpu" = torch.ops.aten.view.default(arg7_1, [1, 13, 64])
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2194 in create_band_mask_from_inputs, code: [to_blocked_mask[:, 1:-3], to_blocked_mask[:, 2:-2], to_blocked_mask[:, 3:-1]], dim=2
slice_1: "f32[1, 13, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view, 0, 0, 9223372036854775807)
slice_2: "f32[1, 9, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_1, 1, 1, -3); slice_1 = None
slice_3: "f32[1, 13, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view, 0, 0, 9223372036854775807)
slice_4: "f32[1, 9, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_3, 1, 2, -2); slice_3 = None
slice_5: "f32[1, 13, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view, 0, 0, 9223372036854775807)
slice_6: "f32[1, 9, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_5, 1, 3, -1); slice_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2193 in create_band_mask_from_inputs, code: exp_blocked_to_pad = torch.cat(
cat: "f32[1, 9, 192][1728, 192, 1]cpu" = torch.ops.aten.cat.default([slice_2, slice_4, slice_6], 2); slice_2 = slice_4 = slice_6 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2196 in create_band_mask_from_inputs, code: band_mask = torch.einsum("blq,blk->blqk", from_blocked_mask[:, 2:-2], exp_blocked_to_pad)
slice_7: "f32[1, 13, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view, 0, 0, 9223372036854775807)
slice_8: "f32[1, 9, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_7, 1, 2, -2); slice_7 = None
unsqueeze: "f32[1, 9, 64, 1][832, 64, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_8, 3); slice_8 = None
permute: "f32[1, 9, 64, 1][832, 64, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze, [0, 1, 2, 3]); unsqueeze = None
unsqueeze_1: "f32[1, 9, 192, 1][1728, 192, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(cat, 3); cat = None
permute_1: "f32[1, 9, 1, 192][1728, 192, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_1, [0, 1, 3, 2]); unsqueeze_1 = None
mul: "f32[1, 9, 64, 192][110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(permute, permute_1); permute = permute_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2197 in create_band_mask_from_inputs, code: band_mask.unsqueeze_(1)
unsqueeze_2: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = torch.ops.aten.unsqueeze.default(mul, 1); mul = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2203 in create_masks_for_block_sparse_attn, code: from_mask = attention_mask.view(batch_size, 1, seq_length, 1)
view_1: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu" = torch.ops.aten.view.default(arg7_1, [1, 1, 832, 1])
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2204 in create_masks_for_block_sparse_attn, code: to_mask = attention_mask.view(batch_size, 1, 1, seq_length)
view_2: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.view.default(arg7_1, [1, 1, 1, 832]); arg7_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:282 in forward, code: position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
slice_9: "i64[1, 4096][4096, 1]cpu" = torch.ops.aten.slice.Tensor(arg5_1, 0, 0, 9223372036854775807); arg5_1 = None
slice_10: "i64[1, 832][4096, 1]cpu" = torch.ops.aten.slice.Tensor(slice_9, 1, 0, 832); slice_9 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:296 in forward, code: inputs_embeds = self.word_embeddings(input_ids)
embedding: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.embedding.default(arg0_1, arg6_1, 0); arg0_1 = arg6_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:301 in forward, code: token_type_embeddings = self.token_type_embeddings(token_type_ids)
embedding_1: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.embedding.default(arg1_1, arg8_1); arg1_1 = arg8_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:303 in forward, code: embeddings = inputs_embeds + token_type_embeddings
add: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(embedding, embedding_1); embedding = embedding_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:305 in forward, code: position_embeddings = self.position_embeddings(position_ids)
embedding_2: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.embedding.default(arg2_1, slice_10); arg2_1 = slice_10 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:306 in forward, code: embeddings += position_embeddings
add_1: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(add, embedding_2); add = embedding_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:308 in forward, code: embeddings = self.dropout(embeddings)
clone: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.clone.default(add_1); add_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:309 in forward, code: embeddings = self.LayerNorm(embeddings)
var_mean = torch.ops.aten.var_mean.correction(clone, [2], correction = 0, keepdim = True)
getitem: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[0]
getitem_1: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[1]; var_mean = None
add_2: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.add.Tensor(getitem, 1e-12); getitem = None
rsqrt: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.rsqrt.default(add_2); add_2 = None
sub: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.sub.Tensor(clone, getitem_1); clone = getitem_1 = None
mul_1: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(sub, rsqrt); sub = rsqrt = None
mul_2: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(mul_1, arg3_1); mul_1 = arg3_1 = None
add_3: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(mul_2, arg4_1); mul_2 = arg4_1 = None
return (add_3, unsqueeze_2, view_1, view_2, view)
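Note: AOTAutograd lowered the einsum to unsqueeze/permute/mul rather than a matmul because "blq,blk->blqk" contracts nothing; every output index appears on the left, so it is a pure broadcast product. A quick equivalence check of the decomposition recorded above (shapes from this trace, random values):

import torch

a = torch.randn(1, 9, 64)    # from_blocked_mask[:, 2:-2]
b = torch.randn(1, 9, 192)   # exp_blocked_to_pad
ref = torch.einsum("blq,blk->blqk", a, b)
# unsqueeze to (1, 9, 64, 1) and (1, 9, 1, 192), then broadcast-multiply;
# the graph's permute([0, 1, 2, 3]) on the first operand is an identity and is omitted here.
dec = a.unsqueeze(3) * b.unsqueeze(3).permute(0, 1, 3, 2)
assert torch.allclose(ref, dec)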
V0627 17:31:03.161000 139845268738432 torch/_inductor/compile_fx.py:754] {"inductor_post_grad_graph": {}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1, "has_payload": "06e74f82fcfa9d791dc26355727799db"}
class <lambda>(torch.nn.Module):
def forward(self, arg6_1: "i64[1, 832][832, 1]cpu", arg7_1: "f32[1, 832][832, 1]cpu", arg8_1: "i64[1, 832][832, 1]cpu"):
# No stacktrace found for following nodes
_frozen_param0: "f32[50358, 768][768, 1]cpu" = self._frozen_param0
_frozen_param1: "f32[2, 768][768, 1]cpu" = self._frozen_param1
_frozen_param3: "f32[768][1]cpu" = self._frozen_param3
_frozen_param4: "f32[768][1]cpu" = self._frozen_param4
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:305 in forward, code: position_embeddings = self.position_embeddings(position_ids)
_frozen_param6: "f32[1, 832, 768][638976, 768, 1]cpu" = self._frozen_param6
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:296 in forward, code: inputs_embeds = self.word_embeddings(input_ids)
embedding: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.embedding.default(_frozen_param0, arg6_1, 0); _frozen_param0 = arg6_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:301 in forward, code: token_type_embeddings = self.token_type_embeddings(token_type_ids)
embedding_1: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.embedding.default(_frozen_param1, arg8_1); _frozen_param1 = arg8_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:303 in forward, code: embeddings = inputs_embeds + token_type_embeddings
add: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(embedding, embedding_1); embedding = embedding_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:306 in forward, code: embeddings += position_embeddings
add_1: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(add, _frozen_param6); add = _frozen_param6 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:309 in forward, code: embeddings = self.LayerNorm(embeddings)
var_mean = torch.ops.aten.var_mean.correction(add_1, [2], correction = 0, keepdim = True)
getitem: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[0]
getitem_1: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[1]; var_mean = None
sub: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.sub.Tensor(add_1, getitem_1); add_1 = getitem_1 = None
add_2: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.add.Tensor(getitem, 1e-12); getitem = None
rsqrt: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.rsqrt.default(add_2); add_2 = None
mul_1: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(sub, rsqrt); sub = rsqrt = None
mul_2: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(mul_1, _frozen_param3); mul_1 = _frozen_param3 = None
add_3: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(mul_2, _frozen_param4); mul_2 = _frozen_param4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2200 in create_masks_for_block_sparse_attn, code: blocked_encoder_mask = attention_mask.view(batch_size, seq_length // block_size, block_size)
view: "f32[1, 13, 64][832, 64, 1]cpu" = torch.ops.aten.reshape.default(arg7_1, [1, 13, 64])
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2194 in create_band_mask_from_inputs, code: [to_blocked_mask[:, 1:-3], to_blocked_mask[:, 2:-2], to_blocked_mask[:, 3:-1]], dim=2
slice_4: "f32[1, 9, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view, 1, 2, -2)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2196 in create_band_mask_from_inputs, code: band_mask = torch.einsum("blq,blk->blqk", from_blocked_mask[:, 2:-2], exp_blocked_to_pad)
unsqueeze: "f32[1, 9, 64, 1][832, 64, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_4, 3)
permute: "f32[1, 9, 64, 1][832, 64, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze, [0, 1, 2, 3]); unsqueeze = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2194 in create_band_mask_from_inputs, code: [to_blocked_mask[:, 1:-3], to_blocked_mask[:, 2:-2], to_blocked_mask[:, 3:-1]], dim=2
slice_2: "f32[1, 9, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view, 1, 1, -3)
slice_6: "f32[1, 9, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view, 1, 3, -1)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2193 in create_band_mask_from_inputs, code: exp_blocked_to_pad = torch.cat(
cat: "f32[1, 9, 192][1728, 192, 1]cpu" = torch.ops.aten.cat.default([slice_2, slice_4, slice_6], 2); slice_2 = slice_4 = slice_6 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2196 in create_band_mask_from_inputs, code: band_mask = torch.einsum("blq,blk->blqk", from_blocked_mask[:, 2:-2], exp_blocked_to_pad)
unsqueeze_1: "f32[1, 9, 192, 1][1728, 192, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(cat, 3); cat = None
permute_1: "f32[1, 9, 1, 192][1728, 192, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_1, [0, 1, 3, 2]); unsqueeze_1 = None
mul: "f32[1, 9, 64, 192][110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(permute, permute_1); permute = permute_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2197 in create_band_mask_from_inputs, code: band_mask.unsqueeze_(1)
unsqueeze_2: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = torch.ops.aten.unsqueeze.default(mul, 1); mul = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2203 in create_masks_for_block_sparse_attn, code: from_mask = attention_mask.view(batch_size, 1, seq_length, 1)
view_1: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu" = torch.ops.aten.reshape.default(arg7_1, [1, 1, 832, 1])
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2204 in create_masks_for_block_sparse_attn, code: to_mask = attention_mask.view(batch_size, 1, 1, seq_length)
view_2: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.reshape.default(arg7_1, [1, 1, 1, 832]); arg7_1 = None
return (add_3, unsqueeze_2, view_1, view_2, view)
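Note: this is the post-grad graph after Inductor's freezing pass. The weights were lifted out as _frozen_param0/1/3/4, and the position-embedding lookup disappeared entirely: position_ids is a registered buffer, so slice_10 plus its embedding were constant-folded into the precomputed (1, 832, 768) tensor _frozen_param6. A sketch of what that folded constant amounts to, assuming the buffer is the usual arange that BigBirdEmbeddings registers (hypothetical stand-in weights):

import torch
import torch.nn.functional as F

position_embedding_weight = torch.randn(4096, 768)   # stand-in for the real parameter
position_ids = torch.arange(4096).unsqueeze(0)       # assumed arange buffer, shape (1, 4096)
frozen_param6 = F.embedding(position_ids[:, :832], position_embedding_weight)
# frozen_param6: (1, 832, 768), baked in so no embedding lookup runs in call()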
V0627 17:31:03.519000 139845268738432 torch/_inductor/graph.py:1693] {"inductor_output_code": {"filename": "/tmp/torchinductor_leslie/de/cdewao76edq6vrvflsagsrjktsdjwfpzvsaaft6tyecuomopfso3.py"}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1, "has_payload": "f30f2b373864eaff49baf96db8ab8cb7"}
# AOT ID: ['3_inference']
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
from torch._inductor.codegen.memory_planning import _align as align
from torch import device, empty_strided
from torch._inductor.async_compile import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
from torch._inductor.codegen.multi_kernel import MultiKernelCall
aten = torch.ops.aten
inductor_ops = torch.ops.inductor
_quantized = torch.ops._quantized
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
alloc_from_pool = torch.ops.inductor._alloc_from_pool
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
async_compile = AsyncCompile()
_frozen_param0 = None # device(type='cpu') torch.float32 (50358, 768) (768, 1) 7f2eb1d44630
_frozen_param1 = None # device(type='cpu') torch.float32 (2, 768) (768, 1) 7f2eb1d445e0
_frozen_param3 = None # device(type='cpu') torch.float32 (768,) (1,) 7f2eb1d44540
_frozen_param4 = None # device(type='cpu') torch.float32 (768,) (1,) 7f2eb1d44810
_frozen_param6 = None # device(type='cpu') torch.float32 (1, 832, 768) (638976, 768, 1) 7f2e3165ccc0
cpp_fused_add_cat_embedding_mul_native_layer_norm_0 = async_compile.cpp_pybinding(['const int64_t*', 'const float*', 'const int64_t*', 'const float*', 'const float*', 'const float*', 'const float*', 'const float*', 'const float*', 'float*', 'float*', 'float*', 'float*', 'float*', 'float*', 'float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const int64_t* in_ptr0,
const float* in_ptr1,
const int64_t* in_ptr2,
const float* in_ptr3,
const float* in_ptr4,
const float* in_ptr5,
const float* in_ptr6,
const float* in_ptr7,
const float* in_ptr8,
float* out_ptr0,
float* out_ptr1,
float* out_ptr2,
float* out_ptr3,
float* out_ptr4,
float* out_ptr5,
float* out_ptr6,
float* out_ptr7)
{
#pragma omp parallel num_threads(56)
{
int tid = omp_get_thread_num();
{
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(832L); x0+=static_cast<long>(1L))
{
{
Welford<float> tmp_acc0 = Welford<float>();
Welford<at::vec::Vectorized<float>> tmp_acc0_vec = Welford<at::vec::Vectorized<float>>();
static WeightRecp<at::vec::Vectorized<float>> weight_recps(static_cast<long>(48L));
for(long x1=static_cast<long>(0L); x1<static_cast<long>(768L); x1+=static_cast<long>(16L))
{
auto tmp0 = in_ptr0[static_cast<long>(x0)];
auto tmp10 = in_ptr2[static_cast<long>(x0)];
auto tmp21 = at::vec::Vectorized<float>::loadu(in_ptr4 + static_cast<long>(x1 + (768L*x0)), 16);
auto tmp1 = 50358L;
auto tmp2 = c10::convert<int64_t>(tmp1);
auto tmp3 = decltype(tmp0)(tmp0 + tmp2);
auto tmp4 = tmp0 < 0;
auto tmp5 = tmp4 ? tmp3 : tmp0;
auto tmp6 = tmp5;
auto tmp7 = c10::convert<int64_t>(tmp6);
TORCH_CHECK((0 <= tmp7) & (tmp7 < 50358L), "index out of bounds: 0 <= tmp7 < 50358L");
auto tmp9 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(x1 + (768L*tmp5)), 16);
auto tmp11 = 2L;
auto tmp12 = c10::convert<int64_t>(tmp11);
auto tmp13 = decltype(tmp10)(tmp10 + tmp12);
auto tmp14 = tmp10 < 0;
auto tmp15 = tmp14 ? tmp13 : tmp10;
auto tmp16 = tmp15;
auto tmp17 = c10::convert<int64_t>(tmp16);
TORCH_CHECK((0 <= tmp17) & (tmp17 < 2L), "index out of bounds: 0 <= tmp17 < 2L");
auto tmp19 = at::vec::Vectorized<float>::loadu(in_ptr3 + static_cast<long>(x1 + (768L*tmp15)), 16);
auto tmp20 = tmp9 + tmp19;
auto tmp22 = tmp20 + tmp21;
tmp22.store(out_ptr0 + static_cast<long>(x1 + (768L*x0)));
tmp_acc0_vec = welford_combine(tmp_acc0_vec, tmp22, &weight_recps);
}
tmp_acc0 = welford_combine(tmp_acc0, welford_vec_reduce_all(tmp_acc0_vec));
out_ptr1[static_cast<long>(x0)] = static_cast<float>(tmp_acc0.mean);
out_ptr2[static_cast<long>(x0)] = static_cast<float>(tmp_acc0.m2);
}
for(long x1=static_cast<long>(0L); x1<static_cast<long>(768L); x1+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr0 + static_cast<long>(x1 + (768L*x0)), 16);
auto tmp1 = out_ptr1[static_cast<long>(x0)];
auto tmp4 = out_ptr2[static_cast<long>(x0)];
auto tmp12 = at::vec::Vectorized<float>::loadu(in_ptr5 + static_cast<long>(x1), 16);
auto tmp14 = at::vec::Vectorized<float>::loadu(in_ptr6 + static_cast<long>(x1), 16);
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 - tmp2;
auto tmp5 = static_cast<float>(768.0);
auto tmp6 = tmp4 / tmp5;
auto tmp7 = static_cast<float>(1e-12);
auto tmp8 = decltype(tmp6)(tmp6 + tmp7);
auto tmp9 = 1 / std::sqrt(tmp8);
auto tmp10 = at::vec::Vectorized<float>(tmp9);
auto tmp11 = tmp3 * tmp10;
auto tmp13 = tmp11 * tmp12;
auto tmp15 = tmp13 + tmp14;
tmp15.store(out_ptr3 + static_cast<long>(x1 + (768L*x0)));
}
}
}
#pragma omp single
{
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(9L); x0+=static_cast<long>(1L))
{
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr7 + static_cast<long>(64L + x1 + (64L*x0)), 16);
tmp0.store(out_ptr4 + static_cast<long>(x1 + (192L*x0)));
}
}
}
}
#pragma omp single
{
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(9L); x0+=static_cast<long>(1L))
{
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr7 + static_cast<long>(128L + x1 + (64L*x0)), 16);
tmp0.store(out_ptr5 + static_cast<long>(x1 + (192L*x0)));
}
}
}
}
#pragma omp single
{
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(9L); x0+=static_cast<long>(1L))
{
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr7 + static_cast<long>(192L + x1 + (64L*x0)), 16);
tmp0.store(out_ptr6 + static_cast<long>(x1 + (192L*x0)));
}
}
}
}
#pragma omp single
{
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(9L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L))
{
for(long x2=static_cast<long>(0L); x2<static_cast<long>(192L); x2+=static_cast<long>(16L))
{
auto tmp0 = in_ptr7[static_cast<long>(128L + x1 + (64L*x0))];
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr8 + static_cast<long>(x2 + (192L*x0)), 16);
auto tmp2 = at::vec::Vectorized<float>(tmp0);
auto tmp3 = tmp2 * tmp1;
tmp3.store(out_ptr7 + static_cast<long>(x2 + (192L*x1) + (12288L*x0)));
}
}
}
}
}
}
}
''')
async_compile.wait(globals())
del async_compile
def call(args):
arg6_1, arg7_1, arg8_1 = args
args.clear()
assert_size_stride(arg6_1, (1, 832), (832, 1))
assert_size_stride(arg7_1, (1, 832), (832, 1))
assert_size_stride(arg8_1, (1, 832), (832, 1))
buf0 = empty_strided_cpu((1, 832, 768), (638976, 768, 1), torch.float32)
buf1 = empty_strided_cpu((1, 832, 1), (832, 1, 832), torch.float32)
buf2 = empty_strided_cpu((1, 832, 1), (832, 1, 832), torch.float32)
buf4 = empty_strided_cpu((1, 832, 768), (638976, 768, 1), torch.float32)
buf8 = empty_strided_cpu((1, 9, 192), (1728, 192, 1), torch.float32)
buf5 = reinterpret_tensor(buf8, (1, 9, 64), (1728, 192, 1), 0) # alias
buf6 = reinterpret_tensor(buf8, (1, 9, 64), (1728, 192, 1), 64) # alias
buf7 = reinterpret_tensor(buf8, (1, 9, 64), (1728, 192, 1), 128) # alias
buf9 = empty_strided_cpu((1, 9, 64, 192), (110592, 12288, 192, 1), torch.float32)
cpp_fused_add_cat_embedding_mul_native_layer_norm_0(arg6_1, _frozen_param0, arg8_1, _frozen_param1, _frozen_param6, _frozen_param3, _frozen_param4, arg7_1, buf8, buf0, buf1, buf2, buf4, buf5, buf6, buf7, buf9)
del arg6_1
del arg8_1
return (buf4, reinterpret_tensor(buf9, (1, 1, 9, 64, 192), (110592, 110592, 12288, 192, 1), 0), reinterpret_tensor(arg7_1, (1, 1, 832, 1), (832, 832, 1, 1), 0), reinterpret_tensor(arg7_1, (1, 1, 1, 832), (832, 832, 832, 1), 0), reinterpret_tensor(arg7_1, (1, 13, 64), (832, 64, 1), 0), )
def benchmark_compiled_module(times=10, repeat=10):
from torch._dynamo.testing import rand_strided
from torch._inductor.utils import print_performance
global _frozen_param0
_frozen_param0 = rand_strided((50358, 768), (768, 1), device='cpu', dtype=torch.float32)
global _frozen_param1
_frozen_param1 = rand_strided((2, 768), (768, 1), device='cpu', dtype=torch.float32)
global _frozen_param3
_frozen_param3 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param4
_frozen_param4 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param6
_frozen_param6 = rand_strided((1, 832, 768), (638976, 768, 1), device='cpu', dtype=torch.float32)
arg6_1 = rand_strided((1, 832), (832, 1), device='cpu', dtype=torch.int64)
arg7_1 = rand_strided((1, 832), (832, 1), device='cpu', dtype=torch.float32)
arg8_1 = rand_strided((1, 832), (832, 1), device='cpu', dtype=torch.int64)
fn = lambda: call([arg6_1, arg7_1, arg8_1])
return print_performance(fn, times=times, repeat=repeat)
if __name__ == "__main__":
from torch._inductor.wrapper_benchmark import compiled_module_main
compiled_module_main('hf_BigBird', benchmark_compiled_module)
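Note: the per-row mean/variance for LayerNorm in the kernel above is accumulated with Welford's algorithm (the Welford<T>/welford_combine helpers come from the shared cskh5dx... header). A sketch of the merge step it relies on, checked against var_mean with correction=0; this mirrors the idea, not the header's exact code:

import torch

def welford_combine(n_a, mean_a, m2_a, n_b, mean_b, m2_b):
    # parallel merge of two (count, mean, sum-of-squared-deviations) partials
    n = n_a + n_b
    delta = mean_b - mean_a
    mean = mean_a + delta * n_b / n
    m2 = m2_a + m2_b + delta * delta * n_a * n_b / n
    return n, mean, m2

x = torch.randn(768)
n, mean, m2 = 0, 0.0, 0.0
for chunk in x.split(16):                      # mimic the 16-lane vectorized accumulation
    cn = chunk.numel()
    n, mean, m2 = welford_combine(n, mean, m2, cn,
                                  chunk.mean().item(),
                                  chunk.var(correction=0).item() * cn)
var, ref_mean = torch.var_mean(x, correction=0)
assert abs(mean - ref_mean.item()) < 1e-4
assert abs(m2 / n - var.item()) < 1e-4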
V0627 17:31:03.542000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:03.542000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1, "has_payload": "1dfadefa57d2d698b82df0a252ee757b"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839714901584)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| | | +- GuardManager: source=L['self'].config, accessed_by=DictGetItemGuardAccessor(config)
| | | | +- TYPE_MATCH: ___check_type_id(L['self'].config, 139839810624528)
| | | +- GuardManager: source=L['self']._modules, accessed_by=DictGetItemGuardAccessor(_modules)
| | | | +- GuardManager: source=L['self'].encoder, accessed_by=DictGetItemGuardAccessor(encoder)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].encoder, 139839713378016)
| | | | | +- GuardManager: source=L['self'].encoder.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- GuardManager: source=L['self'].encoder.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].encoder.training, 7685824)
| | | | +- GuardManager: source=L['self'].embeddings, accessed_by=DictGetItemGuardAccessor(embeddings)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings, 139839713378208)
| | | | | +- GuardManager: source=L['self'].embeddings.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- DICT_CONTAINS: not ___dict_contains('forward', L['self'].embeddings.__dict__)
| | | | | | +- GuardManager: source=L['self'].embeddings.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.training, 7685824)
| | | | | | +- GuardManager: source=L['self'].embeddings._modules, accessed_by=DictGetItemGuardAccessor(_modules)
| | | | | | | +- GuardManager: source=L['self'].embeddings.dropout, accessed_by=DictGetItemGuardAccessor(dropout)
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.dropout, 139839202278704)
| | | | | | | | +- GuardManager: source=L['self'].embeddings.dropout.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | | | | +- GuardManager: source=L['self'].embeddings.dropout.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.dropout.training, 7685824)
| | | | | | | +- GuardManager: source=L['self'].embeddings.LayerNorm, accessed_by=DictGetItemGuardAccessor(LayerNorm)
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.LayerNorm, 139839202278800)
| | | | | | | | +- GuardManager: source=L['self'].embeddings.LayerNorm.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | | | | +- GuardManager: source=L['self'].embeddings.LayerNorm.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.LayerNorm.training, 7685824)
| | | | | | | +- GuardManager: source=L['self'].embeddings.word_embeddings, accessed_by=DictGetItemGuardAccessor(word_embeddings)
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.word_embeddings, 139839202271840)
| | | | | | | | +- GuardManager: source=L['self'].embeddings.word_embeddings.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | | | | +- GuardManager: source=L['self'].embeddings.word_embeddings.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.word_embeddings.training, 7685824)
| | | | | | | +- GuardManager: source=L['self'].embeddings.position_embeddings, accessed_by=DictGetItemGuardAccessor(position_embeddings)
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.position_embeddings, 139839202279184)
| | | | | | | | +- GuardManager: source=L['self'].embeddings.position_embeddings.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | | | | +- GuardManager: source=L['self'].embeddings.position_embeddings.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.position_embeddings.training, 7685824)
| | | | | | | +- GuardManager: source=L['self'].embeddings.token_type_embeddings, accessed_by=DictGetItemGuardAccessor(token_type_embeddings)
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.token_type_embeddings, 139839202279328)
| | | | | | | | +- GuardManager: source=L['self'].embeddings.token_type_embeddings.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | | | | +- GuardManager: source=L['self'].embeddings.token_type_embeddings.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.token_type_embeddings.training, 7685824)
| | | | | | +- GuardManager: source=L['self'].embeddings._buffers, accessed_by=DictGetItemGuardAccessor(_buffers)
| | | | | | | +- GuardManager: source=L['self'].embeddings.position_ids, accessed_by=DictGetItemGuardAccessor(position_ids)
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.position_ids, 139838528701040)
| | | | | | +- GuardManager: source=L['self'].embeddings._forward_hooks, accessed_by=DictGetItemGuardAccessor(_forward_hooks)
| | | | | | +- GuardManager: source=L['self'].embeddings._backward_hooks, accessed_by=DictGetItemGuardAccessor(_backward_hooks)
| | | | | | +- GuardManager: source=L['self'].embeddings._forward_pre_hooks, accessed_by=DictGetItemGuardAccessor(_forward_pre_hooks)
| | | | | | +- GuardManager: source=L['self'].embeddings.rescale_embeddings, accessed_by=DictGetItemGuardAccessor(rescale_embeddings)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].embeddings.rescale_embeddings, 7685824)
| | | | | | +- GuardManager: source=L['self'].embeddings._backward_pre_hooks, accessed_by=DictGetItemGuardAccessor(_backward_pre_hooks)
| | | +- GuardManager: source=L['self'].block_size, accessed_by=DictGetItemGuardAccessor(block_size)
| | | | +- EQUALS_MATCH: L['self'].block_size == 64
| | | +- GuardManager: source=L['self'].attention_type, accessed_by=DictGetItemGuardAccessor(attention_type)
| | | | +- EQUALS_MATCH: L['self'].attention_type == 'block_sparse'
| +- GuardManager: source=L['___stack0'], accessed_by=DictGetItemGuardAccessor(___stack0)
| | +- TYPE_MATCH: ___check_type_id(L['___stack0'], 7625984)
| | +- LENGTH_CHECK: len(L['___stack0']) == 6
| | +- GuardManager: source=L['___stack0'][0], accessed_by=TupleGetItemGuardAccessor(0)
| | | +- EQUALS_MATCH: L['___stack0'][0] == 13
| | +- GuardManager: source=L['___stack0'][1], accessed_by=TupleGetItemGuardAccessor(1)
| | | +- TENSOR_MATCH: check_tensor(L['___stack0'][1], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int64, device=None, requires_grad=False, size=[1, 832], stride=[832, 1])
| | | +- NO_HASATTR: hasattr(L['___stack0'][1], '_dynamo_dynamic_indices') == False
| | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['___stack0'][1], L['___stack0'][2], L['___stack0'][3])
| | +- GuardManager: source=L['___stack0'][2], accessed_by=TupleGetItemGuardAccessor(2)
| | | +- TENSOR_MATCH: check_tensor(L['___stack0'][2], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 832], stride=[832, 1])
| | | +- NO_HASATTR: hasattr(L['___stack0'][2], '_dynamo_dynamic_indices') == False
| | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['___stack0'][1], L['___stack0'][2], L['___stack0'][3])
| | +- GuardManager: source=L['___stack0'][3], accessed_by=TupleGetItemGuardAccessor(3)
| | | +- TENSOR_MATCH: check_tensor(L['___stack0'][3], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int64, device=None, requires_grad=False, size=[1, 832], stride=[832, 1])
| | | +- NO_HASATTR: hasattr(L['___stack0'][3], '_dynamo_dynamic_indices') == False
| | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['___stack0'][1], L['___stack0'][2], L['___stack0'][3])
| | +- GuardManager: source=L['___stack0'][4], accessed_by=TupleGetItemGuardAccessor(4)
| | | +- ID_MATCH: ___check_obj_id(L['___stack0'][4], 7636800)
| | +- GuardManager: source=L['___stack0'][5], accessed_by=TupleGetItemGuardAccessor(5)
| | | +- ID_MATCH: ___check_obj_id(L['___stack0'][5], 7636800)
| +- GuardManager: source=L['head_mask'], accessed_by=DictGetItemGuardAccessor(head_mask)
| | +- ID_MATCH: ___check_obj_id(L['head_mask'], 7636800)
| +- GuardManager: source=L['use_cache'], accessed_by=DictGetItemGuardAccessor(use_cache)
| | +- ID_MATCH: ___check_obj_id(L['use_cache'], 7685824)
| +- GuardManager: source=L['return_dict'], accessed_by=DictGetItemGuardAccessor(return_dict)
| | +- ID_MATCH: ___check_obj_id(L['return_dict'], 7685856)
| +- GuardManager: source=L['past_key_values'], accessed_by=DictGetItemGuardAccessor(past_key_values)
| | +- ID_MATCH: ___check_obj_id(L['past_key_values'], 7636800)
| +- GuardManager: source=L['output_attentions'], accessed_by=DictGetItemGuardAccessor(output_attentions)
| | +- ID_MATCH: ___check_obj_id(L['output_attentions'], 7685824)
| +- GuardManager: source=L['output_hidden_states'], accessed_by=DictGetItemGuardAccessor(output_hidden_states)
| | +- ID_MATCH: ___check_obj_id(L['output_hidden_states'], 7685824)
| +- GuardManager: source=L['encoder_hidden_states'], accessed_by=DictGetItemGuardAccessor(encoder_hidden_states)
| | +- ID_MATCH: ___check_obj_id(L['encoder_hidden_states'], 7636800)
| +- GuardManager: source=L['past_key_values_length'], accessed_by=DictGetItemGuardAccessor(past_key_values_length)
| | +- EQUALS_MATCH: L['past_key_values_length'] == 0
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['torch'], accessed_by=DictGetItemGuardAccessor(torch)
| | | +- ID_MATCH: ___check_obj_id(G['torch'], 139845236322800)
| | | +- GuardManager: source=G['torch'].cat, accessed_by=GetAttrGuardAccessor(cat)
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].cat, 139845228834672)
| | | +- GuardManager: source=G['torch'].einsum, accessed_by=GetAttrGuardAccessor(einsum)
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].einsum, 139842415911568)
| | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'], accessed_by=DictGetItemGuardAccessor(__import_torch_dot_nn_dot_modules_dot_module)
| | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'], 139842442598640)
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'].torch, accessed_by=GetAttrGuardAccessor(torch)
| | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'].torch, 139845236322800)
| | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C, accessed_by=GetAttrGuardAccessor(_C)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C, 139845228547104)
| | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C._get_tracing_state, accessed_by=GetAttrGuardAccessor(_get_tracing_state)
| | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C._get_tracing_state, 139842451895088)
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_hooks, accessed_by=GetAttrGuardAccessor(_global_forward_hooks)
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_hooks, 7497696)
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_hooks
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_hooks, accessed_by=GetAttrGuardAccessor(_global_backward_hooks)
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_hooks, 7497696)
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_hooks
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_pre_hooks, accessed_by=GetAttrGuardAccessor(_global_forward_pre_hooks)
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_pre_hooks, 7497696)
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_pre_hooks
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_pre_hooks, accessed_by=GetAttrGuardAccessor(_global_backward_pre_hooks)
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_pre_hooks, 7497696)
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_pre_hooks
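The guard kinds in the tree above reduce to simple runtime predicates. A minimal Python sketch (not Dynamo's actual C++ guard manager; the integer constants such as 7636800 are assumed here to be process-local CPython object ids of singletons like None/False/True, captured at compile time):

def check_obj_id(obj, expected_id):
    # ID_MATCH / ___check_obj_id: identity against an id recorded at compile time
    return id(obj) == expected_id

def equals_match(value, expected):
    # EQUALS_MATCH, e.g. L['self'].block_size == 64
    return value == expected

def dict_length_empty(d):
    # DICT_LENGTH of the form "not G[...]._global_forward_hooks"
    return not d

past_key_values, use_cache = None, False
assert check_obj_id(past_key_values, id(None))
assert check_obj_id(use_cache, id(False))
assert equals_match(64, 64)
assert dict_length_empty({})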
V0627 17:31:03.543000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "5/0", "frame_key": "10", "co_name": "torch_dynamo_resume_in_forward_at_2077", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 2077, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 57, "shape_env_guard_count": 0, "graph_op_count": 18, "graph_node_count": 23, "graph_input_count": 3, "start_time": 1719534662.9121282, "entire_frame_compile_time_s": 0.6307895183563232, "backend_compile_time_s": 0.49609994888305664, "inductor_compile_time_s": 0.37875938415527344, "code_gen_time_s": 0.3245351314544678, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Graph break due to unsupported builtin numpy.random.mtrand.seed. This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind). If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround. If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/docs/main/notes/custom_operators.html for more details) or, if it is traceable, use torch.compiler.allow_in_graph."], "dynamo_time_before_restart_s": 0.08347654342651367, "has_guarded_code": true}, "frame_id": 5, "frame_compile_id": 0, "attempt": 1}
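The restart_reasons entry above records a graph break on numpy.random.mtrand.seed; the frame name torch_dynamo_resume_in_forward_at_2077 is the continuation Dynamo compiles after that break. A small repro sketch of the same class of break (hypothetical function f, eager backend):

import numpy as np
import torch

@torch.compile(backend="eager")
def f(x):
    np.random.seed(0)  # unsupported C-extension builtin -> graph break here
    return x + 1       # traced separately in a resume frame

f(torch.ones(2))

Per the message in the log, wrapping the offending call in a PyTorch custom operator, or marking it with torch.compiler.allow_in_graph when it is traceable, avoids the frame split.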
V0627 17:31:03.544000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.565000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 19, "size": 442368}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.565000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 19}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.565000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 19, "id": 0, "source": "L['band_mask']"}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.566000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 19, "size": 2555904}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.566000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6e30>", "describer_id": 19}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.566000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 19, "id": 1, "source": "L['hidden_states']"}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.567000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 19, "size": 3328}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.568000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 19}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.568000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 2, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 19}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.568000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 19, "id": 2, "source": "L['from_mask']"}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.570000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 5, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 19}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.570000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 19, "id": 5, "source": "L['to_mask']"}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.600000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "6/0", "frame_key": "11", "co_name": "forward", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 1578, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": null, "shape_env_guard_count": null, "graph_op_count": null, "graph_node_count": null, "graph_input_count": null, "start_time": 1719534663.5449224, "entire_frame_compile_time_s": null, "backend_compile_time_s": null, "inductor_compile_time_s": null, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": [], "dynamo_time_before_restart_s": 0.05504441261291504, "has_guarded_code": false}, "frame_id": 6, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.600000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.607000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 20, "size": 442368}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.608000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 20}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.608000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 20, "id": 0, "source": "L['band_mask']"}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.609000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 20, "size": 2555904}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.609000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6e30>", "describer_id": 20}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.609000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 20, "id": 1, "source": "L['hidden_states']"}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.610000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 20, "size": 3328}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.610000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 20}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.610000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 2, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 20}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.611000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 20, "id": 2, "source": "L['from_mask']"}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.612000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 4, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 20}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.612000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 20, "id": 4, "source": "L['to_mask']"}, "frame_id": 7, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.643000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 21, "size": 2555904}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.643000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6e30>", "describer_id": 21}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.643000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 21, "id": 0, "source": "L['hidden_states']"}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.644000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 21, "size": 442368}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.644000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 21}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.644000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 21, "id": 1, "source": "L['band_mask']"}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.645000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 21, "size": 3328}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.645000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 21}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.645000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 2, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 21}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.645000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 21, "id": 2, "source": "L['from_mask']"}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.646000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 4, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 21}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.647000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 21, "id": 4, "source": "L['to_mask']"}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.648000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 5, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 13, 64], "is_leaf": true, "is_view": true, "stride": [832, 64, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317fbec0>", "describer_id": 21}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.648000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 21, "id": 5, "source": "L['blocked_encoder_mask']"}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.656000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:03.656000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1, "has_payload": "76aa2c3aac969b0b973556e5e5d20d8b"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202276400)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| | | +- GuardManager: source=L['self']._modules, accessed_by=DictGetItemGuardAccessor(_modules)
| | | | +- GuardManager: source=L['self'].attention, accessed_by=DictGetItemGuardAccessor(attention)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].attention, 139839202275632)
| | | | | +- GuardManager: source=L['self'].attention.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- GuardManager: source=L['self'].attention.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].attention.training, 7685824)
| +- GuardManager: source=L['to_mask'], accessed_by=DictGetItemGuardAccessor(to_mask)
| | +- TENSOR_MATCH: check_tensor(L['to_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 1, 832], stride=[832, 832, 832, 1])
| | +- NO_HASATTR: hasattr(L['to_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['blocked_encoder_mask'])
| +- GuardManager: source=L['band_mask'], accessed_by=DictGetItemGuardAccessor(band_mask)
| | +- TENSOR_MATCH: check_tensor(L['band_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 9, 64, 192], stride=[110592, 110592, 12288, 192, 1])
| | +- NO_HASATTR: hasattr(L['band_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['blocked_encoder_mask'])
| +- GuardManager: source=L['from_mask'], accessed_by=DictGetItemGuardAccessor(from_mask)
| | +- TENSOR_MATCH: check_tensor(L['from_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 832, 1], stride=[832, 832, 1, 1])
| | +- NO_HASATTR: hasattr(L['from_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['blocked_encoder_mask'])
| +- GuardManager: source=L['head_mask'], accessed_by=DictGetItemGuardAccessor(head_mask)
| | +- ID_MATCH: ___check_obj_id(L['head_mask'], 7636800)
| +- GuardManager: source=L['hidden_states'], accessed_by=DictGetItemGuardAccessor(hidden_states)
| | +- TENSOR_MATCH: check_tensor(L['hidden_states'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 832, 768], stride=[638976, 768, 1])
| | +- NO_HASATTR: hasattr(L['hidden_states'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['blocked_encoder_mask'])
| +- GuardManager: source=L['attention_mask'], accessed_by=DictGetItemGuardAccessor(attention_mask)
| | +- ID_MATCH: ___check_obj_id(L['attention_mask'], 7636800)
| +- GuardManager: source=L['past_key_value'], accessed_by=DictGetItemGuardAccessor(past_key_value)
| | +- ID_MATCH: ___check_obj_id(L['past_key_value'], 7636800)
| +- GuardManager: source=L['output_attentions'], accessed_by=DictGetItemGuardAccessor(output_attentions)
| | +- ID_MATCH: ___check_obj_id(L['output_attentions'], 7685824)
| +- GuardManager: source=L['blocked_encoder_mask'], accessed_by=DictGetItemGuardAccessor(blocked_encoder_mask)
| | +- TENSOR_MATCH: check_tensor(L['blocked_encoder_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 13, 64], stride=[832, 64, 1])
| | +- NO_HASATTR: hasattr(L['blocked_encoder_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['blocked_encoder_mask'])
| +- GuardManager: source=L['encoder_hidden_states'], accessed_by=DictGetItemGuardAccessor(encoder_hidden_states)
| | +- ID_MATCH: ___check_obj_id(L['encoder_hidden_states'], 7636800)
| +- GuardManager: source=L['encoder_attention_mask'], accessed_by=DictGetItemGuardAccessor(encoder_attention_mask)
| | +- ID_MATCH: ___check_obj_id(L['encoder_attention_mask'], 7636800)
V0627 17:31:03.657000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "7/0", "frame_key": "12", "co_name": "forward", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 1472, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 18, "shape_env_guard_count": 0, "graph_op_count": 0, "graph_node_count": 5, "graph_input_count": 5, "start_time": 1719534663.6010149, "entire_frame_compile_time_s": 0.05594229698181152, "backend_compile_time_s": null, "inductor_compile_time_s": null, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Graph break due to unsupported builtin numpy.random.mtrand.seed. This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind). If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround. If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/docs/main/notes/custom_operators.html for more details) or, if it is traceable, use torch.compiler.allow_in_graph."], "dynamo_time_before_restart_s": 0.039438724517822266, "has_guarded_code": true}, "frame_id": 7, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.657000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.659000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 22, "size": 442368}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.659000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 22}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.659000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 22, "id": 0, "source": "L['band_mask']"}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.660000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 22, "size": 2555904}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.660000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6e30>", "describer_id": 22}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.660000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 22, "id": 1, "source": "L['hidden_states']"}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.661000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 22, "size": 3328}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.662000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 22}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.662000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 2, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 22}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.662000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 22, "id": 2, "source": "L['from_mask']"}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.664000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 4, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 22}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.664000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 22, "id": 4, "source": "L['to_mask']"}, "frame_id": 8, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.694000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 23, "size": 442368}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.694000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 23}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.694000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 23, "id": 0, "source": "L['band_mask']"}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.695000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 23, "size": 2555904}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.695000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6e30>", "describer_id": 23}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.695000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 23, "id": 1, "source": "L['hidden_states']"}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.696000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 23, "size": 3328}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.696000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 23}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.697000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 2, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 23}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.697000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 23, "id": 2, "source": "L['from_mask']"}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.698000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 4, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 23}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.699000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 23, "id": 4, "source": "L['to_mask']"}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.701000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 5, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 13, 64], "is_leaf": true, "is_view": true, "stride": [832, 64, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317fbec0>", "describer_id": 23}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.701000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 23, "id": 5, "source": "L['from_blocked_mask']"}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.704000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {"l_band_mask_": [1, 1, 9, 64, 192], "l_from_mask_": [1, 1, 832, 1], "l_to_mask_": [1, 1, 1, 832], "band_mask": [1, 1, 9, 64, 192], "from_mask": [1, 1, 832, 1], "to_mask": [1, 1, 1, 832]}}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1, "has_payload": "b5eca5f100188f494b5033015854eb4e"}
class GraphModule(torch.nn.Module):
def forward(self, L_band_mask_: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu", L_from_mask_: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu", L_to_mask_: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu"):
l_band_mask_ = L_band_mask_
l_from_mask_ = L_from_mask_
l_to_mask_ = L_to_mask_
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1383 in forward, code: band_mask = band_mask.to(hidden_states.dtype)
band_mask: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = l_band_mask_.to(torch.float32); l_band_mask_ = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1385 in forward, code: from_mask = from_mask.to(hidden_states.dtype)
from_mask: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu" = l_from_mask_.to(torch.float32); l_from_mask_ = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1387 in forward, code: to_mask = to_mask.to(hidden_states.dtype)
to_mask: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = l_to_mask_.to(torch.float32); l_to_mask_ = None
return (band_mask, from_mask, to_mask)
V0627 17:31:03.718000 139845268738432 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:199] {"aot_forward_graph": {}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1, "has_payload": "3ad949bb5c0c76a73cad2e99f5c9ebe2"}
class <lambda>(torch.nn.Module):
def forward(self, arg0_1: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu", arg1_1: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu", arg2_1: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu"):
return (arg0_1, arg1_1, arg2_1)
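The three .to(hidden_states.dtype) calls captured in the Dynamo graph for this frame vanish in the AOT forward graph above: hidden_states is already float32, so each conversion is a no-op that functionalization drops, leaving a pure pass-through. A quick standalone check, with the band_mask shape taken from the trace:

import torch

band_mask = torch.zeros(1, 1, 9, 64, 192)
out = band_mask.to(torch.float32)
print(out is band_mask)  # True: same-dtype, same-device .to returns the input tensor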
V0627 17:31:03.731000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:03.731000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1, "has_payload": "6f76e9e822f6dc2ebb0dbc0f0100927d"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202275632)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| | | +- GuardManager: source=L['self']._modules, accessed_by=DictGetItemGuardAccessor(_modules)
| | | | +- GuardManager: source=L['self'].self, accessed_by=DictGetItemGuardAccessor(self)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].self, 139839202274384)
| | | | | +- GuardManager: source=L['self'].self.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- GuardManager: source=L['self'].self.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].self.training, 7685824)
| | | +- GuardManager: source=L['self'].attention_type, accessed_by=DictGetItemGuardAccessor(attention_type)
| | | | +- EQUALS_MATCH: L['self'].attention_type == 'block_sparse'
| +- GuardManager: source=L['to_mask'], accessed_by=DictGetItemGuardAccessor(to_mask)
| | +- TENSOR_MATCH: check_tensor(L['to_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 1, 832], stride=[832, 832, 832, 1])
| | +- NO_HASATTR: hasattr(L['to_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['band_mask'], accessed_by=DictGetItemGuardAccessor(band_mask)
| | +- TENSOR_MATCH: check_tensor(L['band_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 9, 64, 192], stride=[110592, 110592, 12288, 192, 1])
| | +- NO_HASATTR: hasattr(L['band_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['from_mask'], accessed_by=DictGetItemGuardAccessor(from_mask)
| | +- TENSOR_MATCH: check_tensor(L['from_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 832, 1], stride=[832, 832, 1, 1])
| | +- NO_HASATTR: hasattr(L['from_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['hidden_states'], accessed_by=DictGetItemGuardAccessor(hidden_states)
| | +- TENSOR_MATCH: check_tensor(L['hidden_states'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 832, 768], stride=[638976, 768, 1])
| | +- NO_HASATTR: hasattr(L['hidden_states'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['from_blocked_mask'], accessed_by=DictGetItemGuardAccessor(from_blocked_mask)
| | +- TENSOR_MATCH: check_tensor(L['from_blocked_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 13, 64], stride=[832, 64, 1])
| | +- NO_HASATTR: hasattr(L['from_blocked_mask'], '_dynamo_dynamic_indices') == False
| | +- TENSOR_ALIASING: L['from_blocked_mask'] is L['to_blocked_mask']
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['output_attentions'], accessed_by=DictGetItemGuardAccessor(output_attentions)
| | +- ID_MATCH: ___check_obj_id(L['output_attentions'], 7685824)
| +- GuardManager: source=L['encoder_hidden_states'], accessed_by=DictGetItemGuardAccessor(encoder_hidden_states)
| | +- ID_MATCH: ___check_obj_id(L['encoder_hidden_states'], 7636800)
| +- GuardManager: source=L['to_blocked_mask'], accessed_by=DictGetItemGuardAccessor(to_blocked_mask)
| | +- TENSOR_ALIASING: L['from_blocked_mask'] is L['to_blocked_mask']
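TENSOR_MATCH pins dtype, size, stride, and requires_grad, while TENSOR_ALIASING / NO_TENSOR_ALIASING pin the aliasing pattern observed at compile time (here from_blocked_mask and to_blocked_mask were the same object). A simplified Python analogue (assumption: the real checks run in C++ and also validate device and the DispatchKeySet shown above):

import torch

def tensor_match(t, dtype, size, stride, requires_grad=False):
    return (type(t) is torch.Tensor
            and t.dtype == dtype
            and t.requires_grad == requires_grad
            and tuple(t.size()) == tuple(size)
            and tuple(t.stride()) == tuple(stride))

def check_no_aliasing(*tensors):
    # NO_TENSOR_ALIASING: every guarded tensor must be a distinct object
    return len({id(t) for t in tensors}) == len(tensors)

mask = torch.ones(1, 832)
from_blocked_mask = mask.view(1, 13, 64)
to_blocked_mask = from_blocked_mask  # TENSOR_ALIASING: must stay the same object
assert tensor_match(from_blocked_mask, torch.float32, (1, 13, 64), (832, 64, 1))
assert to_blocked_mask is from_blocked_mask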
V0627 17:31:03.732000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "8/0", "frame_key": "13", "co_name": "forward", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 1365, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 16, "shape_env_guard_count": 0, "graph_op_count": 3, "graph_node_count": 7, "graph_input_count": 3, "start_time": 1719534663.657894, "entire_frame_compile_time_s": 0.07398724555969238, "backend_compile_time_s": 0.02206587791442871, "inductor_compile_time_s": 0.0003921985626220703, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Graph break due to unsupported builtin numpy.random.mtrand.seed. This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind). If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround. If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/docs/main/notes/custom_operators.html for more details) or, if it is traceable, use torch.compiler.allow_in_graph."], "dynamo_time_before_restart_s": 0.03484821319580078, "has_guarded_code": true}, "frame_id": 8, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.732000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 9, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.733000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 25, "size": 2555904}, "frame_id": 9, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.733000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6e30>", "describer_id": 25}, "frame_id": 9, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.733000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 25, "id": 0, "source": "L['hidden_states']"}, "frame_id": 9, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.752000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 26, "size": 2555904}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.752000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6e30>", "describer_id": 26}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.752000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 26, "id": 0, "source": "L['hidden_states']"}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.763000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 7, "describer_id": 26, "size": 442368}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.763000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 7, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 7, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 26}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.763000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 26, "id": 7, "source": "L['band_mask']"}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.764000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 8, "describer_id": 26, "size": 3328}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.764000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 9, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 8, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 26}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.764000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 8, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 8, "base": 9, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 26}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.764000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 26, "id": 8, "source": "L['from_mask']"}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.765000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 10, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 8, "base": 9, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 26}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.765000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 26, "id": 10, "source": "L['to_mask']"}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.766000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 11, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 13, 64], "is_leaf": true, "is_view": true, "stride": [832, 64, 1], "storage": 8, "base": 9, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317fbec0>", "describer_id": 26}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.766000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 26, "id": 11, "source": "L['from_blocked_mask']"}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1}
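The describe_tensor records above show that from_mask, to_mask, and from_blocked_mask are all views of one [1, 832] float mask created under no_grad (they share storage id 8 with base id 9). Reconstructed with the logged shapes and strides:

import torch

with torch.no_grad():
    mask = torch.ones(1, 832)                 # base tensor; storage shared by all views
    from_mask = mask.view(1, 1, 832, 1)       # stride (832, 832, 1, 1)
    to_mask = mask.view(1, 1, 1, 832)         # stride (832, 832, 832, 1)
    from_blocked_mask = mask.view(1, 13, 64)  # stride (832, 64, 1)

print(from_mask._base is mask and to_mask._base is mask)  # True: views track their base
print(from_mask.untyped_storage().data_ptr() == to_mask.untyped_storage().data_ptr())  # True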
V0627 17:31:03.768000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {"l_hidden_states_": [1, 832, 768], "l__self___query": [1, 832, 768], "x": [1, 832, 12, 64], "query_layer": [1, 12, 832, 64], "l__self___key": [1, 832, 768], "x_1": [1, 832, 12, 64], "key_layer": [1, 12, 832, 64], "l__self___value": [1, 832, 768], "x_2": [1, 832, 12, 64], "value_layer": [1, 12, 832, 64]}}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1, "has_payload": "0aeb326082c3dce6efb144a844a46bdf"}
class GraphModule(torch.nn.Module):
def forward(self, L_hidden_states_: "f32[1, 832, 768][638976, 768, 1]cpu"):
l_hidden_states_ = L_hidden_states_
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:468 in forward, code: query_layer = self.transpose_for_scores(self.query(hidden_states))
l__self___query: "bf16[1, 832, 768][638976, 768, 1]cpu" = self.L__self___query(l_hidden_states_)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
x: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = l__self___query.view(1, 832, 12, 64); l__self___query = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
query_layer: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = x.permute(0, 2, 1, 3); x = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:469 in forward, code: key_layer = self.transpose_for_scores(self.key(hidden_states))
l__self___key: "bf16[1, 832, 768][638976, 768, 1]cpu" = self.L__self___key(l_hidden_states_)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
x_1: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = l__self___key.view(1, 832, 12, 64); l__self___key = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
key_layer: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = x_1.permute(0, 2, 1, 3); x_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:470 in forward, code: value_layer = self.transpose_for_scores(self.value(hidden_states))
l__self___value: "bf16[1, 832, 768][638976, 768, 1]cpu" = self.L__self___value(l_hidden_states_); l_hidden_states_ = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
x_2: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = l__self___value.view(1, 832, 12, 64); l__self___value = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
value_layer: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = x_2.permute(0, 2, 1, 3); x_2 = None
return (query_layer, key_layer, value_layer)
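The bf16 annotations on l__self___query/key/value in the graph above come from AutocastCPU (also visible in the DispatchKeySet of the tensor guards), and the [638976, 64, 768, 1] output strides come from the view + permute in transpose_for_scores. Both reproduce standalone; this is a sketch with shapes taken from the trace, not the model's actual module:

import torch

B, S, H, D = 1, 832, 12, 64
hidden_states = torch.randn(B, S, H * D)      # float32 input, as in the trace
query = torch.nn.Linear(H * D, H * D)

with torch.autocast("cpu", dtype=torch.bfloat16):
    q = query(hidden_states)                  # AutocastCPU runs Linear in bf16

query_layer = q.view(B, S, H, D).permute(0, 2, 1, 3)  # transpose_for_scores pattern
print(q.dtype)               # torch.bfloat16
print(query_layer.shape)     # torch.Size([1, 12, 832, 64])
print(query_layer.stride())  # (638976, 64, 768, 1)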
V0627 17:31:03.817000 139845268738432 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:199] {"aot_forward_graph": {}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1, "has_payload": "d91c82abb65a6c80f96ee652ae86f63d"}
class <lambda>(torch.nn.Module):
    def forward(self, arg0_1: "f32[768, 768][768, 1]cpu", arg1_1: "f32[768][1]cpu", arg2_1: "f32[768, 768][768, 1]cpu", arg3_1: "f32[768][1]cpu", arg4_1: "f32[768, 768][768, 1]cpu", arg5_1: "f32[768][1]cpu", arg6_1: "f32[1, 832, 768][638976, 768, 1]cpu"):
        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:468 in forward, code: query_layer = self.transpose_for_scores(self.query(hidden_states))
        convert_element_type: "bf16[768][1]cpu" = torch.ops.prims.convert_element_type.default(arg1_1, torch.bfloat16); arg1_1 = None
        convert_element_type_1: "bf16[768, 768][768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg0_1, torch.bfloat16); arg0_1 = None
        convert_element_type_2: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg6_1, torch.bfloat16)
        view: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.view.default(convert_element_type_2, [832, 768]); convert_element_type_2 = None
        permute: "bf16[768, 768][1, 768]cpu" = torch.ops.aten.permute.default(convert_element_type_1, [1, 0]); convert_element_type_1 = None
        addmm: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.addmm.default(convert_element_type, view, permute); convert_element_type = view = permute = None
        view_1: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.view.default(addmm, [1, 832, 768]); addmm = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
        view_2: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.view.default(view_1, [1, 832, 12, 64]); view_1 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
        permute_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_2, [0, 2, 1, 3]); view_2 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:469 in forward, code: key_layer = self.transpose_for_scores(self.key(hidden_states))
        convert_element_type_6: "bf16[768][1]cpu" = torch.ops.prims.convert_element_type.default(arg3_1, torch.bfloat16); arg3_1 = None
        convert_element_type_7: "bf16[768, 768][768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg2_1, torch.bfloat16); arg2_1 = None
        convert_element_type_8: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg6_1, torch.bfloat16)
        view_3: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.view.default(convert_element_type_8, [832, 768]); convert_element_type_8 = None
        permute_2: "bf16[768, 768][1, 768]cpu" = torch.ops.aten.permute.default(convert_element_type_7, [1, 0]); convert_element_type_7 = None
        addmm_1: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.addmm.default(convert_element_type_6, view_3, permute_2); convert_element_type_6 = view_3 = permute_2 = None
        view_4: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.view.default(addmm_1, [1, 832, 768]); addmm_1 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
        view_5: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.view.default(view_4, [1, 832, 12, 64]); view_4 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
        permute_3: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_5, [0, 2, 1, 3]); view_5 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:470 in forward, code: value_layer = self.transpose_for_scores(self.value(hidden_states))
        convert_element_type_12: "bf16[768][1]cpu" = torch.ops.prims.convert_element_type.default(arg5_1, torch.bfloat16); arg5_1 = None
        convert_element_type_13: "bf16[768, 768][768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg4_1, torch.bfloat16); arg4_1 = None
        convert_element_type_14: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg6_1, torch.bfloat16); arg6_1 = None
        view_6: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.view.default(convert_element_type_14, [832, 768]); convert_element_type_14 = None
        permute_4: "bf16[768, 768][1, 768]cpu" = torch.ops.aten.permute.default(convert_element_type_13, [1, 0]); convert_element_type_13 = None
        addmm_2: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.addmm.default(convert_element_type_12, view_6, permute_4); convert_element_type_12 = view_6 = permute_4 = None
        view_7: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.view.default(addmm_2, [1, 832, 768]); addmm_2 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
        view_8: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.view.default(view_7, [1, 832, 12, 64]); view_7 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
        permute_5: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_8, [0, 2, 1, 3]); view_8 = None
        return (permute_1, permute_3, permute_5)
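In the AOT forward graph, each nn.Linear has been decomposed under CPU autocast into bfloat16 convert_element_type casts followed by view + permute + addmm. A hedged sketch verifying that this decomposition matches nn.functional.linear (the bf16 casts are elided here so the fp32 comparison stays exact):

import torch

x = torch.randn(1, 832, 768)
w = torch.randn(768, 768)
b = torch.randn(768)

# view -> addmm on the transposed weight -> view, as in the graph above
decomposed = torch.addmm(b, x.view(832, 768), w.permute(1, 0)).view(1, 832, 768)
reference = torch.nn.functional.linear(x, w, b)
assert torch.allclose(decomposed, reference, atol=1e-5)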
V0627 17:31:03.886000 139845268738432 torch/_inductor/compile_fx.py:754] {"inductor_post_grad_graph": {}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1, "has_payload": "1df7e432445d21ccb26ae7f35b4ee86e"}
class <lambda>(torch.nn.Module):
    def forward(self, arg6_1: "f32[1, 832, 768][638976, 768, 1]cpu"):
        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:468 in forward, code: query_layer = self.transpose_for_scores(self.query(hidden_states))
        _frozen_param6: "bf16[768][1]cpu" = self._frozen_param6

        # No stacktrace found for following nodes
        _frozen_param12: "bf16[768, 768][1, 0]cpu" = self._frozen_param12

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:469 in forward, code: key_layer = self.transpose_for_scores(self.key(hidden_states))
        _frozen_param8: "bf16[768][1]cpu" = self._frozen_param8

        # No stacktrace found for following nodes
        _frozen_param13: "bf16[768, 768][1, 0]cpu" = self._frozen_param13

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:470 in forward, code: value_layer = self.transpose_for_scores(self.value(hidden_states))
        _frozen_param10: "bf16[768][1]cpu" = self._frozen_param10

        # No stacktrace found for following nodes
        _frozen_param14: "bf16[768, 768][1, 0]cpu" = self._frozen_param14

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:468 in forward, code: query_layer = self.transpose_for_scores(self.query(hidden_states))
        convert_element_type_2: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg6_1, torch.bfloat16); arg6_1 = None
        _linear_pointwise_default_5: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.mkldnn._linear_pointwise.default(convert_element_type_2, _frozen_param12, _frozen_param6, 'none', [], ''); _frozen_param12 = _frozen_param6 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
        view_2: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.reshape.default(_linear_pointwise_default_5, [1, 832, 12, 64]); _linear_pointwise_default_5 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
        permute_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_2, [0, 2, 1, 3]); view_2 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:469 in forward, code: key_layer = self.transpose_for_scores(self.key(hidden_states))
        _linear_pointwise_default_4: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.mkldnn._linear_pointwise.default(convert_element_type_2, _frozen_param13, _frozen_param8, 'none', [], ''); _frozen_param13 = _frozen_param8 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
        view_5: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.reshape.default(_linear_pointwise_default_4, [1, 832, 12, 64]); _linear_pointwise_default_4 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
        permute_3: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_5, [0, 2, 1, 3]); view_5 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:470 in forward, code: value_layer = self.transpose_for_scores(self.value(hidden_states))
        _linear_pointwise_default_3: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.mkldnn._linear_pointwise.default(convert_element_type_2, _frozen_param14, _frozen_param10, 'none', [], ''); convert_element_type_2 = _frozen_param14 = _frozen_param10 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
        view_8: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.reshape.default(_linear_pointwise_default_3, [1, 832, 12, 64]); _linear_pointwise_default_3 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
        permute_5: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_8, [0, 2, 1, 3]); view_8 = None
        return (permute_1, permute_3, permute_5)
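Relative to the AOT graph, this post-grad graph has been frozen: the per-call weight/bias casts were constant-folded into the _frozen_param* attributes (the nonstandard [1, 0] strides are apparently an artifact of MKLDNN weight prepacking), and each addmm was rewritten to torch.ops.mkldnn._linear_pointwise, so only the activation cast (convert_element_type_2) survives at runtime. A hedged sketch of the knobs that drive this path (model and input below are placeholders; defaults vary by PyTorch build):

import torch

torch._inductor.config.freezing = True  # fold parameters into graph constants

model = MyModel().eval()                # placeholder: freezing targets inference graphs
compiled = torch.compile(model)
with torch.no_grad(), torch.autocast("cpu", dtype=torch.bfloat16):
    out = compiled(example_input)       # placeholder input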
V0627 17:31:03.909000 139845268738432 torch/_inductor/graph.py:1693] {"inductor_output_code": {"filename": "/tmp/torchinductor_leslie/rm/crmmdl3pvsdue2ht6qffev3qnvhhdsc4zixorhqtjreztfur5zhi.py"}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1, "has_payload": "a40072c55bb96853547fea577aa47ba2"}
# AOT ID: ['5_inference']
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
from torch._inductor.codegen.memory_planning import _align as align

from torch import device, empty_strided
from torch._inductor.async_compile import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
from torch._inductor.codegen.multi_kernel import MultiKernelCall

aten = torch.ops.aten
inductor_ops = torch.ops.inductor
_quantized = torch.ops._quantized
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
alloc_from_pool = torch.ops.inductor._alloc_from_pool
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
async_compile = AsyncCompile()

_frozen_param6 = None  # device(type='cpu') torch.bfloat16 (768,) (1,) 7f2e311efab0
_frozen_param12 = None  # device(type='cpu') torch.bfloat16 (768, 768) (1, 0) 7f2e311c2750
_frozen_param8 = None  # device(type='cpu') torch.bfloat16 (768,) (1,) 7f2e311b8770
_frozen_param13 = None  # device(type='cpu') torch.bfloat16 (768, 768) (1, 0) 7f2e311cefc0
_frozen_param10 = None  # device(type='cpu') torch.bfloat16 (768,) (1,) 7f2e311dbbf0
_frozen_param14 = None  # device(type='cpu') torch.bfloat16 (768, 768) (1, 0) 7f2e311ac090

cpp_fused__to_copy_0 = async_compile.cpp_pybinding(['const float*', 'bfloat16*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const float* in_ptr0,
                       bfloat16* out_ptr0)
{
    #pragma omp parallel num_threads(56)
    {
        int tid = omp_get_thread_num();
        {
            #pragma omp for
            for(long x0=static_cast<long>(0L); x0<static_cast<long>(638976L); x0+=static_cast<long>(16L))
            {
                auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<long>(x0), 16);
                auto tmp1 = at::vec::convert<bfloat16>(tmp0);
                tmp1.store(out_ptr0 + static_cast<long>(x0), 16);
            }
        }
    }
}
''')

async_compile.wait(globals())
del async_compile

def call(args):
    arg6_1, = args
    args.clear()
    assert_size_stride(arg6_1, (1, 832, 768), (638976, 768, 1))
    buf0 = empty_strided_cpu((1, 832, 768), (638976, 768, 1), torch.bfloat16)
    cpp_fused__to_copy_0(arg6_1, buf0)
    del arg6_1
    buf1 = torch.ops.mkldnn._linear_pointwise(buf0, _frozen_param12, _frozen_param6, 'none', [-1], '')
    buf2 = torch.ops.mkldnn._linear_pointwise(buf0, _frozen_param13, _frozen_param8, 'none', [-1], '')
    buf3 = torch.ops.mkldnn._linear_pointwise(buf0, _frozen_param14, _frozen_param10, 'none', [-1], '')
    return (reinterpret_tensor(buf1, (1, 12, 832, 64), (638976, 64, 768, 1), 0), reinterpret_tensor(buf2, (1, 12, 832, 64), (638976, 64, 768, 1), 0), reinterpret_tensor(buf3, (1, 12, 832, 64), (638976, 64, 768, 1), 0), )

def benchmark_compiled_module(times=10, repeat=10):
    from torch._dynamo.testing import rand_strided
    from torch._inductor.utils import print_performance
    global _frozen_param6
    _frozen_param6 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.bfloat16)
    global _frozen_param12
    _frozen_param12 = rand_strided((768, 768), (1, 0), device='cpu', dtype=torch.bfloat16)
    global _frozen_param8
    _frozen_param8 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.bfloat16)
    global _frozen_param13
    _frozen_param13 = rand_strided((768, 768), (1, 0), device='cpu', dtype=torch.bfloat16)
    global _frozen_param10
    _frozen_param10 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.bfloat16)
    global _frozen_param14
    _frozen_param14 = rand_strided((768, 768), (1, 0), device='cpu', dtype=torch.bfloat16)
    arg6_1 = rand_strided((1, 832, 768), (638976, 768, 1), device='cpu', dtype=torch.float32)
    fn = lambda: call([arg6_1])
    return print_performance(fn, times=times, repeat=repeat)

if __name__ == "__main__":
    from torch._inductor.wrapper_benchmark import compiled_module_main
    compiled_module_main('hf_BigBird', benchmark_compiled_module)
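The file named in the inductor_output_code record is a self-contained script: its __main__ block wires benchmark_compiled_module into compiled_module_main for standalone timing. A hedged usage sketch (the path below is the one from this machine's log and will differ elsewhere):

import runpy

# Re-runs the generated wrapper's __main__ block, which populates the
# _frozen_param* globals with rand_strided data and times call().
runpy.run_path(
    "/tmp/torchinductor_leslie/rm/crmmdl3pvsdue2ht6qffev3qnvhhdsc4zixorhqtjreztfur5zhi.py",
    run_name="__main__",
)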
V0627 17:31:03.922000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:03.923000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1, "has_payload": "300400f770725170203fcbe28e6ee223"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202274384)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| | | +- GuardManager: source=L['self']._modules, accessed_by=DictGetItemGuardAccessor(_modules)
| | | | +- GuardManager: source=L['self'].key, accessed_by=DictGetItemGuardAccessor(key)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].key, 139839202273568)
| | | | | +- GuardManager: source=L['self'].key.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- GuardManager: source=L['self'].key.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].key.training, 7685824)
| | | | +- GuardManager: source=L['self'].query, accessed_by=DictGetItemGuardAccessor(query)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].query, 139839202273616)
| | | | | +- GuardManager: source=L['self'].query.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- GuardManager: source=L['self'].query.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].query.training, 7685824)
| | | | +- GuardManager: source=L['self'].value, accessed_by=DictGetItemGuardAccessor(value)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].value, 139839202273040)
| | | | | +- GuardManager: source=L['self'].value.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- GuardManager: source=L['self'].value.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].value.training, 7685824)
| | | +- GuardManager: source=L['self'].seed, accessed_by=DictGetItemGuardAccessor(seed)
| | | | +- EQUALS_MATCH: L['self'].seed == 0
| | | +- GuardManager: source=L['self'].block_size, accessed_by=DictGetItemGuardAccessor(block_size)
| | | | +- EQUALS_MATCH: L['self'].block_size == 64
| | | +- GuardManager: source=L['self'].num_random_blocks, accessed_by=DictGetItemGuardAccessor(num_random_blocks)
| | | | +- EQUALS_MATCH: L['self'].num_random_blocks == 3
| | | +- GuardManager: source=L['self'].attention_head_size, accessed_by=DictGetItemGuardAccessor(attention_head_size)
| | | | +- EQUALS_MATCH: L['self'].attention_head_size == 64
| | | +- GuardManager: source=L['self'].num_attention_heads, accessed_by=DictGetItemGuardAccessor(num_attention_heads)
| | | | +- EQUALS_MATCH: L['self'].num_attention_heads == 12
| +- GuardManager: source=L['to_mask'], accessed_by=DictGetItemGuardAccessor(to_mask)
| | +- TENSOR_MATCH: check_tensor(L['to_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 1, 832], stride=[832, 832, 832, 1])
| | +- NO_HASATTR: hasattr(L['to_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['band_mask'], accessed_by=DictGetItemGuardAccessor(band_mask)
| | +- TENSOR_MATCH: check_tensor(L['band_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 9, 64, 192], stride=[110592, 110592, 12288, 192, 1])
| | +- NO_HASATTR: hasattr(L['band_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['from_mask'], accessed_by=DictGetItemGuardAccessor(from_mask)
| | +- TENSOR_MATCH: check_tensor(L['from_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 832, 1], stride=[832, 832, 1, 1])
| | +- NO_HASATTR: hasattr(L['from_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['hidden_states'], accessed_by=DictGetItemGuardAccessor(hidden_states)
| | +- TENSOR_MATCH: check_tensor(L['hidden_states'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 832, 768], stride=[638976, 768, 1])
| | +- NO_HASATTR: hasattr(L['hidden_states'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['from_blocked_mask'], accessed_by=DictGetItemGuardAccessor(from_blocked_mask)
| | +- TENSOR_MATCH: check_tensor(L['from_blocked_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 13, 64], stride=[832, 64, 1])
| | +- NO_HASATTR: hasattr(L['from_blocked_mask'], '_dynamo_dynamic_indices') == False
| | +- TENSOR_ALIASING: L['from_blocked_mask'] is L['to_blocked_mask']
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['output_attentions'], accessed_by=DictGetItemGuardAccessor(output_attentions)
| | +- ID_MATCH: ___check_obj_id(L['output_attentions'], 7685824)
| +- GuardManager: source=L['to_blocked_mask'], accessed_by=DictGetItemGuardAccessor(to_blocked_mask)
| | +- TENSOR_ALIASING: L['from_blocked_mask'] is L['to_blocked_mask']
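A hedged Python paraphrase of what one branch of this guard tree checks at cache-lookup time; the real evaluation happens in the C++ guard manager, but the semantics are the same (the function name is ours):

import torch

def to_mask_guards_hold(to_mask):
    # TENSOR_MATCH + NO_HASATTR for L['to_mask'] in the tree above
    return (
        isinstance(to_mask, torch.Tensor)
        and to_mask.dtype == torch.float32
        and not to_mask.requires_grad
        and tuple(to_mask.shape) == (1, 1, 1, 832)
        and to_mask.stride() == (832, 832, 832, 1)
        and not hasattr(to_mask, "_dynamo_dynamic_indices")
    )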
V0627 17:31:03.923000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "9/0", "frame_key": "14", "co_name": "forward", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 446, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 21, "shape_env_guard_count": 0, "graph_op_count": 9, "graph_node_count": 11, "graph_input_count": 1, "start_time": 1719534663.7326608, "entire_frame_compile_time_s": 0.19047832489013672, "backend_compile_time_s": 0.14537477493286133, "inductor_compile_time_s": 0.0376286506652832, "code_gen_time_s": 0.016646862030029297, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Graph break due to unsupported builtin numpy.random.mtrand.seed. This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind). If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround. If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/docs/main/notes/custom_operators.html for more details) or, if it is traceable, use torch.compiler.allow_in_graph."], "dynamo_time_before_restart_s": 0.018494129180908203, "has_guarded_code": true}, "frame_id": 9, "frame_compile_id": 0, "attempt": 1}
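The restart_reasons entry above records a graph break on numpy.random.seed, a C-extension builtin Dynamo cannot trace. A minimal sketch reproducing that class of break and surfacing it with the explain utility (toy function, not the model code):

import numpy as np
import torch

def f(x):
    np.random.seed(0)   # untraceable C builtin -> graph break, as in the log
    return torch.sin(x)

print(torch._dynamo.explain(f)(torch.randn(4)))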
V0627 17:31:03.925000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 472, "name": "forward", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 10, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:03.985000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 10, "frame_compile_id": 0, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:03.986000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 10, "frame_compile_id": 0, "attempt": 1, "has_payload": "f6f5661bfad0dc293ecc9ef35ede39a0"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['seed'], accessed_by=DictGetItemGuardAccessor(seed)
| | +- EQUALS_MATCH: L['seed'] == 0
| +- GuardManager: source=L['batch_size'], accessed_by=DictGetItemGuardAccessor(batch_size)
| | +- EQUALS_MATCH: L['batch_size'] == 1
| +- GuardManager: source=L['to_seq_len'], accessed_by=DictGetItemGuardAccessor(to_seq_len)
| | +- EQUALS_MATCH: L['to_seq_len'] == 832
| +- GuardManager: source=L['from_seq_len'], accessed_by=DictGetItemGuardAccessor(from_seq_len)
| | +- EQUALS_MATCH: L['from_seq_len'] == 832
| +- GuardManager: source=L['to_block_size'], accessed_by=DictGetItemGuardAccessor(to_block_size)
| | +- EQUALS_MATCH: L['to_block_size'] == 64
| +- GuardManager: source=L['from_block_size'], accessed_by=DictGetItemGuardAccessor(from_block_size)
| | +- EQUALS_MATCH: L['from_block_size'] == 64
| +- GuardManager: source=L['attention_head_size'], accessed_by=DictGetItemGuardAccessor(attention_head_size)
| | +- EQUALS_MATCH: L['attention_head_size'] == 64
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['np'], accessed_by=DictGetItemGuardAccessor(np)
| | | +- ID_MATCH: ___check_obj_id(G['np'], 139845228893488)
| | | +- GuardManager: source=G['np'].random, accessed_by=GetAttrGuardAccessor(random)
| | | | +- ID_MATCH: ___check_obj_id(G['np'].random, 139842452860464)
| | | | +- GuardManager: source=G['np'].random.seed, accessed_by=GetAttrGuardAccessor(seed)
| | | | | +- ID_MATCH: ___check_obj_id(G['np'].random.seed, 139842451129264)
| | +- GuardManager: source=G['math'], accessed_by=DictGetItemGuardAccessor(math)
| | | +- ID_MATCH: ___check_obj_id(G['math'], 139845236089744)
| | | +- GuardManager: source=G['math'].sqrt, accessed_by=GetAttrGuardAccessor(sqrt)
| | | | +- ID_MATCH: ___check_obj_id(G['math'].sqrt, 139845236093344)
V0627 17:31:03.986000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "10/0", "frame_key": "15", "co_name": "bigbird_block_sparse_attention", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 516, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 17, "shape_env_guard_count": 0, "graph_op_count": 0, "graph_node_count": 0, "graph_input_count": 0, "start_time": 1719534663.9255476, "entire_frame_compile_time_s": 0.06077218055725098, "backend_compile_time_s": null, "inductor_compile_time_s": null, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Graph break due to unsupported builtin numpy.random.mtrand.seed. This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind). If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround. If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/docs/main/notes/custom_operators.html for more details) or, if it is traceable, use torch.compiler.allow_in_graph."], "dynamo_time_before_restart_s": 0.008862972259521484, "has_guarded_code": true}, "frame_id": 10, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:03.987000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 472, "name": "forward", "filename": 5}, {"line": 569, "name": "bigbird_block_sparse_attention", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 11, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.076000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 11, "frame_compile_id": 0, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:04.077000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 11, "frame_compile_id": 0, "attempt": 1, "has_payload": "f3df28d4d21dab674ac56179543067e7"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202274384)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| +- GuardManager: source=L['n_heads'], accessed_by=DictGetItemGuardAccessor(n_heads)
| | +- EQUALS_MATCH: L['n_heads'] == 12
| +- GuardManager: source=L['to_seq_len'], accessed_by=DictGetItemGuardAccessor(to_seq_len)
| | +- EQUALS_MATCH: L['to_seq_len'] == 832
| +- GuardManager: source=L['from_seq_len'], accessed_by=DictGetItemGuardAccessor(from_seq_len)
| | +- EQUALS_MATCH: L['from_seq_len'] == 832
| +- GuardManager: source=L['n_rand_blocks'], accessed_by=DictGetItemGuardAccessor(n_rand_blocks)
| | +- EQUALS_MATCH: L['n_rand_blocks'] == 3
| +- GuardManager: source=L['to_block_size'], accessed_by=DictGetItemGuardAccessor(to_block_size)
| | +- EQUALS_MATCH: L['to_block_size'] == 64
| +- GuardManager: source=L['from_block_size'], accessed_by=DictGetItemGuardAccessor(from_block_size)
| | +- EQUALS_MATCH: L['from_block_size'] == 64
| +- GuardManager: source=L['plan_from_length'], accessed_by=DictGetItemGuardAccessor(plan_from_length)
| | +- ID_MATCH: ___check_obj_id(L['plan_from_length'], 7636800)
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['__builtins_dict___37'], accessed_by=DictGetItemGuardAccessor(__builtins_dict___37)
| | | +- GuardManager: source=G['__builtins_dict___37']['int'], accessed_by=DictGetItemGuardAccessor(int)
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___37']['int'], 7648640)
V0627 17:31:04.077000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "11/0", "frame_key": "16", "co_name": "torch_dynamo_resume_in_bigbird_block_sparse_attention_at_569", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 569, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 14, "shape_env_guard_count": 0, "graph_op_count": 0, "graph_node_count": 0, "graph_input_count": 0, "start_time": 1719534663.987453, "entire_frame_compile_time_s": 0.08972334861755371, "backend_compile_time_s": null, "inductor_compile_time_s": null, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["data dependent operator: aten._local_scalar_dense.default; to enable, set torch._dynamo.config.capture_scalar_outputs = True"], "dynamo_time_before_restart_s": 0.02129364013671875, "has_guarded_code": true}, "frame_id": 11, "frame_compile_id": 0, "attempt": 1}
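Here the restart reason is instead a data-dependent operator (aten._local_scalar_dense, i.e. Tensor.item()), and the log names the opt-in flag. A hedged sketch of that toggle on a toy function:

import torch

torch._dynamo.config.capture_scalar_outputs = True  # flag named in restart_reasons

@torch.compile
def g(x):
    n = int(x.sum().item())  # _local_scalar_dense: captured instead of breaking
    return x * n

print(g(torch.tensor([2.0, 3.0])))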
V0627 17:31:04.078000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 472, "name": "forward", "filename": 5}, {"line": 569, "name": "bigbird_block_sparse_attention", "filename": 5}, {"line": 583, "name": "torch_dynamo_resume_in_bigbird_block_sparse_attention_at_569", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 12, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.094000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {"wrapped_array": [2], "plan_block_length": [2]}}, "frame_id": 12, "frame_compile_id": 0, "attempt": 1, "has_payload": "9f82d9593d608d32ba61e6298aeb3649"}
class GraphModule(torch.nn.Module):
    def forward(self):
        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1160 in _bigbird_block_rand_mask_with_head, code: plan_block_length = np.array(plan_from_length) // from_block_size
        wrapped_array: "i64[2][1]cpu" = torch__dynamo_utils_wrapped_array([704, 832])
        plan_block_length: "i64[2][1]cpu" = torch__dynamo_utils_wrapped_floordiv(wrapped_array, 64); wrapped_array = None
        return (plan_block_length,)
V0627 17:31:04.106000 139845268738432 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:199] {"aot_forward_graph": {}, "frame_id": 12, "frame_compile_id": 0, "attempt": 1, "has_payload": "1861734e40a5f61860344b326195085c"}
class <lambda>(torch.nn.Module):
    def forward(self):
        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1160 in _bigbird_block_rand_mask_with_head, code: plan_block_length = np.array(plan_from_length) // from_block_size
        _tensor_constant0 = self._tensor_constant0
        lift_fresh_copy: "i64[2][1]cpu" = torch.ops.aten.lift_fresh_copy.default(_tensor_constant0); _tensor_constant0 = None
        clone: "i64[2][1]cpu" = torch.ops.aten.clone.default(lift_fresh_copy); lift_fresh_copy = None
        div: "i64[2][1]cpu" = torch.ops.aten.div.Tensor_mode(clone, 64, rounding_mode = 'floor'); clone = None
        return (div,)
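The graph pair above shows how Dynamo captures a numpy expression: np.array(plan_from_length) // from_block_size becomes lift_fresh_copy/clone plus a floor-division aten op. A sketch checking that the two agree on the traced values:

import numpy as np
import torch

plan_from_length, from_block_size = [704, 832], 64
np_result = np.array(plan_from_length) // from_block_size
torch_result = torch.div(torch.tensor(plan_from_length), from_block_size, rounding_mode="floor")
assert (torch_result.numpy() == np_result).all()  # both give [11, 13]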
V0627 17:31:04.122000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 12, "frame_compile_id": 0, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:04.122000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 12, "frame_compile_id": 0, "attempt": 1, "has_payload": "c1fc05dff62c1bc070ea06a12430d940"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['num_heads'], accessed_by=DictGetItemGuardAccessor(num_heads)
| | +- EQUALS_MATCH: L['num_heads'] == 12
| +- GuardManager: source=L['to_block_size'], accessed_by=DictGetItemGuardAccessor(to_block_size)
| | +- EQUALS_MATCH: L['to_block_size'] == 64
| +- GuardManager: source=L['to_seq_length'], accessed_by=DictGetItemGuardAccessor(to_seq_length)
| | +- EQUALS_MATCH: L['to_seq_length'] == 832
| +- GuardManager: source=L['from_block_size'], accessed_by=DictGetItemGuardAccessor(from_block_size)
| | +- EQUALS_MATCH: L['from_block_size'] == 64
| +- GuardManager: source=L['from_seq_length'], accessed_by=DictGetItemGuardAccessor(from_seq_length)
| | +- EQUALS_MATCH: L['from_seq_length'] == 832
| +- GuardManager: source=L['plan_from_length'], accessed_by=DictGetItemGuardAccessor(plan_from_length)
| | +- TYPE_MATCH: ___check_type_id(L['plan_from_length'], 7650400)
| | +- LENGTH_CHECK: len(L['plan_from_length']) == 2
| | +- GuardManager: source=L['plan_from_length'][0], accessed_by=ListGetItemGuardAccessor(0)
| | | +- EQUALS_MATCH: L['plan_from_length'][0] == 704
| | +- GuardManager: source=L['plan_from_length'][1], accessed_by=ListGetItemGuardAccessor(1)
| | | +- EQUALS_MATCH: L['plan_from_length'][1] == 832
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['np'], accessed_by=DictGetItemGuardAccessor(np)
| | | +- ID_MATCH: ___check_obj_id(G['np'], 139845228893488)
| | | +- GuardManager: source=G['np'].array, accessed_by=GetAttrGuardAccessor(array)
| | | | +- ID_MATCH: ___check_obj_id(G['np'].array, 139845228959664)
| | +- GuardManager: source=G['__builtins_dict___40'], accessed_by=DictGetItemGuardAccessor(__builtins_dict___40)
| | | +- GuardManager: source=G['__builtins_dict___40']['list'], accessed_by=DictGetItemGuardAccessor(list)
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___40']['list'], 7650400)
| | | +- GuardManager: source=G['__builtins_dict___40']['range'], accessed_by=DictGetItemGuardAccessor(range)
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___40']['range'], 7632448)
| | | +- GuardManager: source=G['__builtins_dict___40']['enumerate'], accessed_by=DictGetItemGuardAccessor(enumerate)
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___40']['enumerate'], 7513024)
| | +- GuardManager: source=G['__import_torch_dot__dynamo_dot_polyfill'], accessed_by=DictGetItemGuardAccessor(__import_torch_dot__dynamo_dot_polyfill)
| | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot__dynamo_dot_polyfill'], 139839728158336)
V0627 17:31:04.122000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "12/0", "frame_key": "17", "co_name": "_bigbird_block_rand_mask_with_head", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 1111, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 19, "shape_env_guard_count": 0, "graph_op_count": 2, "graph_node_count": 3, "graph_input_count": 0, "start_time": 1719534664.0783854, "entire_frame_compile_time_s": 0.04439258575439453, "backend_compile_time_s": 0.01859426498413086, "inductor_compile_time_s": 0.00021767616271972656, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["data dependent operator: aten._local_scalar_dense.default; to enable, set torch._dynamo.config.capture_scalar_outputs = True"], "dynamo_time_before_restart_s": 0.00835871696472168, "has_guarded_code": true}, "frame_id": 12, "frame_compile_id": 0, "attempt": 1}
V0627 17:31:04.123000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 472, "name": "forward", "filename": 5}, {"line": 569, "name": "bigbird_block_sparse_attention", "filename": 5}, {"line": 583, "name": "torch_dynamo_resume_in_bigbird_block_sparse_attention_at_569", "filename": 5}, {"line": 1165, "name": "_bigbird_block_rand_mask_with_head", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 13, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.127000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "13/0", "frame_key": "18", "co_name": "<listcomp>", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 1165, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": null, "shape_env_guard_count": null, "graph_op_count": null, "graph_node_count": null, "graph_input_count": null, "start_time": 1719534664.1237168, "entire_frame_compile_time_s": null, "backend_compile_time_s": null, "inductor_compile_time_s": null, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": [], "dynamo_time_before_restart_s": 0.0032224655151367188, "has_guarded_code": false}, "frame_id": 13, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.127000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 472, "name": "forward", "filename": 5}, {"line": 569, "name": "bigbird_block_sparse_attention", "filename": 5}, {"line": 583, "name": "torch_dynamo_resume_in_bigbird_block_sparse_attention_at_569", "filename": 5}, {"line": 1165, "name": "_bigbird_block_rand_mask_with_head", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.131000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.131000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311dafc0>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.131000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 36, "id": 0, "source": "___from_numpy(L['___stack0'][0])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.133000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.133000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311c3510>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.133000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 36, "id": 3, "source": "___from_numpy(L['___stack0'][1])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.135000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.135000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 4, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30db3dd0>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.135000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 36, "id": 4, "source": "___from_numpy(L['___stack0'][2])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.136000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 3, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.136000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 5, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 3, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30db1fd0>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.136000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 36, "id": 5, "source": "___from_numpy(L['___stack0'][3])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.138000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 4, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.138000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 6, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 4, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311daf70>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.138000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 36, "id": 6, "source": "___from_numpy(L['___stack0'][4])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.139000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 5, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.139000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 7, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 5, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311b8ea0>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.140000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 36, "id": 7, "source": "___from_numpy(L['___stack0'][5])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.141000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 6, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.141000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 8, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 6, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311ba200>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.141000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 36, "id": 8, "source": "___from_numpy(L['___stack0'][6])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.142000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 7, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.142000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 9, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 7, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e3119dd50>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.142000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 36, "id": 9, "source": "___from_numpy(L['___stack0'][7])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.144000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 8, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.144000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 10, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 8, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e3119c6d0>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.144000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 36, "id": 10, "source": "___from_numpy(L['___stack0'][8])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.145000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 9, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.146000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 11, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 9, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e3119c4a0>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.146000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 36, "id": 11, "source": "___from_numpy(L['___stack0'][9])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.147000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 10, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.147000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 12, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 10, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30d4d580>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.147000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 36, "id": 12, "source": "___from_numpy(L['___stack0'][10])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.148000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 11, "describer_id": 36, "size": 156}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.149000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 13, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 11, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30d4df30>", "describer_id": 36}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.149000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 36, "id": 13, "source": "___from_numpy(L['___stack0'][11])"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.153000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {"l_stack0_0_": [13, 3], "l_stack0_1_": [13, 3], "l_stack0_2_": [13, 3], "l_stack0_3_": [13, 3], "l_stack0_4_": [13, 3], "l_stack0_5_": [13, 3], "l_stack0_6_": [13, 3], "l_stack0_7_": [13, 3], "l_stack0_8_": [13, 3], "l_stack0_9_": [13, 3], "l_stack0_10_": [13, 3], "l_stack0_11_": [13, 3], "wrapped_getitem": [11, 3], "wrapped_getitem_1": [11, 3], "wrapped_getitem_2": [11, 3], "wrapped_getitem_3": [11, 3], "wrapped_getitem_4": [11, 3], "wrapped_getitem_5": [11, 3], "wrapped_getitem_6": [11, 3], "wrapped_getitem_7": [11, 3], "wrapped_getitem_8": [11, 3], "wrapped_getitem_9": [11, 3], "wrapped_getitem_10": [11, 3], "wrapped_getitem_11": [11, 3]}}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0, "has_payload": "17b857365a2979b2936469716bb70fcd"}
class GraphModule(torch.nn.Module):
    def forward(self, L_stack0_0_: "i32[13, 3][3, 1]cpu", L_stack0_1_: "i32[13, 3][3, 1]cpu", L_stack0_2_: "i32[13, 3][3, 1]cpu", L_stack0_3_: "i32[13, 3][3, 1]cpu", L_stack0_4_: "i32[13, 3][3, 1]cpu", L_stack0_5_: "i32[13, 3][3, 1]cpu", L_stack0_6_: "i32[13, 3][3, 1]cpu", L_stack0_7_: "i32[13, 3][3, 1]cpu", L_stack0_8_: "i32[13, 3][3, 1]cpu", L_stack0_9_: "i32[13, 3][3, 1]cpu", L_stack0_10_: "i32[13, 3][3, 1]cpu", L_stack0_11_: "i32[13, 3][3, 1]cpu"):
        l_stack0_0_ = L_stack0_0_
        l_stack0_1_ = L_stack0_1_
        l_stack0_2_ = L_stack0_2_
        l_stack0_3_ = L_stack0_3_
        l_stack0_4_ = L_stack0_4_
        l_stack0_5_ = L_stack0_5_
        l_stack0_6_ = L_stack0_6_
        l_stack0_7_ = L_stack0_7_
        l_stack0_8_ = L_stack0_8_
        l_stack0_9_ = L_stack0_9_
        l_stack0_10_ = L_stack0_10_
        l_stack0_11_ = L_stack0_11_
        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1172 in torch_dynamo_resume_in__bigbird_block_rand_mask_with_head_at_1165, code: rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :]
        wrapped_getitem: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem(l_stack0_0_, (slice(1, 12, None), slice(None, None, None))); l_stack0_0_ = None
        wrapped_getitem_1: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_1(l_stack0_1_, (slice(1, 12, None), slice(None, None, None))); l_stack0_1_ = None
        wrapped_getitem_2: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_2(l_stack0_2_, (slice(1, 12, None), slice(None, None, None))); l_stack0_2_ = None
        wrapped_getitem_3: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_3(l_stack0_3_, (slice(1, 12, None), slice(None, None, None))); l_stack0_3_ = None
        wrapped_getitem_4: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_4(l_stack0_4_, (slice(1, 12, None), slice(None, None, None))); l_stack0_4_ = None
        wrapped_getitem_5: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_5(l_stack0_5_, (slice(1, 12, None), slice(None, None, None))); l_stack0_5_ = None
        wrapped_getitem_6: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_6(l_stack0_6_, (slice(1, 12, None), slice(None, None, None))); l_stack0_6_ = None
        wrapped_getitem_7: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_7(l_stack0_7_, (slice(1, 12, None), slice(None, None, None))); l_stack0_7_ = None
        wrapped_getitem_8: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_8(l_stack0_8_, (slice(1, 12, None), slice(None, None, None))); l_stack0_8_ = None
        wrapped_getitem_9: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_9(l_stack0_9_, (slice(1, 12, None), slice(None, None, None))); l_stack0_9_ = None
        wrapped_getitem_10: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_10(l_stack0_10_, (slice(1, 12, None), slice(None, None, None))); l_stack0_10_ = None
        wrapped_getitem_11: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_11(l_stack0_11_, (slice(1, 12, None), slice(None, None, None))); l_stack0_11_ = None
        return (wrapped_getitem, wrapped_getitem_1, wrapped_getitem_2, wrapped_getitem_3, wrapped_getitem_4, wrapped_getitem_5, wrapped_getitem_6, wrapped_getitem_7, wrapped_getitem_8, wrapped_getitem_9, wrapped_getitem_10, wrapped_getitem_11)
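This graph is the per-head trimming loop from modeling_big_bird.py:1172 unrolled twelve times: each (13, 3) rand_attn entry loses its first and last block rows. An eager-mode equivalent, as a sketch using the constant values visible in the guards further down (num_blocks=13, global_block_top=global_block_bottom=1; the zero arrays are placeholders):

import numpy as np

num_blocks, global_block_top, global_block_bottom = 13, 1, 1
rand_attn = [np.zeros((num_blocks, 3), dtype=np.int32) for _ in range(12)]
for nh in range(12):
    # same slice the traced wrapped_getitem calls perform
    rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :]
assert all(a.shape == (11, 3) for a in rand_attn)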
V0627 17:31:04.205000 139845268738432 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:199] {"aot_forward_graph": {}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0, "has_payload": "949e5e36955b193a43ad76da2f5a5f52"}
class <lambda>(torch.nn.Module):
    def forward(self, arg0_1: "i32[13, 3][3, 1]cpu", arg1_1: "i32[13, 3][3, 1]cpu", arg2_1: "i32[13, 3][3, 1]cpu", arg3_1: "i32[13, 3][3, 1]cpu", arg4_1: "i32[13, 3][3, 1]cpu", arg5_1: "i32[13, 3][3, 1]cpu", arg6_1: "i32[13, 3][3, 1]cpu", arg7_1: "i32[13, 3][3, 1]cpu", arg8_1: "i32[13, 3][3, 1]cpu", arg9_1: "i32[13, 3][3, 1]cpu", arg10_1: "i32[13, 3][3, 1]cpu", arg11_1: "i32[13, 3][3, 1]cpu"):
        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1172 in torch_dynamo_resume_in__bigbird_block_rand_mask_with_head_at_1165, code: rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :]
        slice_1: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg0_1, 0, 1, 12); arg0_1 = None
        slice_2: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_1, 1, 0, 9223372036854775807); slice_1 = None
        slice_3: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg1_1, 0, 1, 12); arg1_1 = None
        slice_4: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_3, 1, 0, 9223372036854775807); slice_3 = None
        slice_5: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg2_1, 0, 1, 12); arg2_1 = None
        slice_6: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_5, 1, 0, 9223372036854775807); slice_5 = None
        slice_7: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg3_1, 0, 1, 12); arg3_1 = None
        slice_8: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_7, 1, 0, 9223372036854775807); slice_7 = None
        slice_9: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg4_1, 0, 1, 12); arg4_1 = None
        slice_10: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_9, 1, 0, 9223372036854775807); slice_9 = None
        slice_11: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg5_1, 0, 1, 12); arg5_1 = None
        slice_12: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_11, 1, 0, 9223372036854775807); slice_11 = None
        slice_13: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg6_1, 0, 1, 12); arg6_1 = None
        slice_14: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_13, 1, 0, 9223372036854775807); slice_13 = None
        slice_15: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg7_1, 0, 1, 12); arg7_1 = None
        slice_16: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_15, 1, 0, 9223372036854775807); slice_15 = None
        slice_17: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg8_1, 0, 1, 12); arg8_1 = None
        slice_18: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_17, 1, 0, 9223372036854775807); slice_17 = None
        slice_19: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg9_1, 0, 1, 12); arg9_1 = None
        slice_20: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_19, 1, 0, 9223372036854775807); slice_19 = None
        slice_21: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg10_1, 0, 1, 12); arg10_1 = None
        slice_22: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_21, 1, 0, 9223372036854775807); slice_21 = None
        slice_23: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg11_1, 0, 1, 12); arg11_1 = None
        slice_24: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_23, 1, 0, 9223372036854775807); slice_23 = None
        return (slice_2, slice_4, slice_6, slice_8, slice_10, slice_12, slice_14, slice_16, slice_18, slice_20, slice_22, slice_24)
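In the AOT graph each Python `[1:12, :]` subscript decomposes into two aten.slice.Tensor calls, one per dimension; end=9223372036854775807 (2**63 - 1) is the conventional "slice to the end" sentinel, so the dim-1 call is a no-op view. A quick sketch of the correspondence:

import torch

x = torch.zeros(13, 3, dtype=torch.int32)
a = torch.ops.aten.slice.Tensor(x, 0, 1, 12)                   # rows 1..11
b = torch.ops.aten.slice.Tensor(a, 1, 0, 9223372036854775807)  # all columns
assert torch.equal(b, x[1:12, :])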
V0627 17:31:04.236000 139845268738432 torch/_inductor/compile_fx.py:754] {"inductor_post_grad_graph": {}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0, "has_payload": "9c01c298863b324227937cc74dd4d962"}
class <lambda>(torch.nn.Module):
    def forward(self, arg0_1: "i32[13, 3][3, 1]cpu", arg1_1: "i32[13, 3][3, 1]cpu", arg2_1: "i32[13, 3][3, 1]cpu", arg3_1: "i32[13, 3][3, 1]cpu", arg4_1: "i32[13, 3][3, 1]cpu", arg5_1: "i32[13, 3][3, 1]cpu", arg6_1: "i32[13, 3][3, 1]cpu", arg7_1: "i32[13, 3][3, 1]cpu", arg8_1: "i32[13, 3][3, 1]cpu", arg9_1: "i32[13, 3][3, 1]cpu", arg10_1: "i32[13, 3][3, 1]cpu", arg11_1: "i32[13, 3][3, 1]cpu"):
        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1172 in torch_dynamo_resume_in__bigbird_block_rand_mask_with_head_at_1165, code: rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :]
        slice_1: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg0_1, 0, 1, 12); arg0_1 = None
        slice_3: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg1_1, 0, 1, 12); arg1_1 = None
        slice_5: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg2_1, 0, 1, 12); arg2_1 = None
        slice_7: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg3_1, 0, 1, 12); arg3_1 = None
        slice_9: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg4_1, 0, 1, 12); arg4_1 = None
        slice_11: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg5_1, 0, 1, 12); arg5_1 = None
        slice_13: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg6_1, 0, 1, 12); arg6_1 = None
        slice_15: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg7_1, 0, 1, 12); arg7_1 = None
        slice_17: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg8_1, 0, 1, 12); arg8_1 = None
        slice_19: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg9_1, 0, 1, 12); arg9_1 = None
        slice_21: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg10_1, 0, 1, 12); arg10_1 = None
        slice_23: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg11_1, 0, 1, 12); arg11_1 = None
        return (slice_1, slice_3, slice_5, slice_7, slice_9, slice_11, slice_13, slice_15, slice_17, slice_19, slice_21, slice_23)
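The post-grad graph has dropped the even-numbered dim-1 slices: a full-range slice returns a view with identical metadata, so slice_2, slice_4, ... were redundant. A sketch of why that rewrite is safe:

import torch

x = torch.zeros(11, 3, dtype=torch.int32)
y = torch.ops.aten.slice.Tensor(x, 1, 0, 9223372036854775807)  # full-range slice
assert y.shape == x.shape and y.stride() == x.stride()
assert y.data_ptr() == x.data_ptr()  # same storage, no copy: a pure no-op view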
V0627 17:31:04.250000 139845268738432 torch/_inductor/graph.py:1693] {"inductor_output_code": {"filename": "/tmp/torchinductor_leslie/ki/ckiwvz5uks7efnnd5ew4d76276bcutmxkkfh6ozi5dgumqv3m2qc.py"}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0, "has_payload": "6641116284eedbc64e23effbbbfe40e6"}
# AOT ID: ['7_inference']
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
from torch._inductor.codegen.memory_planning import _align as align
from torch import device, empty_strided
from torch._inductor.async_compile import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
from torch._inductor.codegen.multi_kernel import MultiKernelCall
aten = torch.ops.aten
inductor_ops = torch.ops.inductor
_quantized = torch.ops._quantized
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
alloc_from_pool = torch.ops.inductor._alloc_from_pool
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
async_compile = AsyncCompile()
async_compile.wait(globals())
del async_compile
def call(args):
    arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1 = args
    args.clear()
    assert_size_stride(arg0_1, (13, 3), (3, 1))
    assert_size_stride(arg1_1, (13, 3), (3, 1))
    assert_size_stride(arg2_1, (13, 3), (3, 1))
    assert_size_stride(arg3_1, (13, 3), (3, 1))
    assert_size_stride(arg4_1, (13, 3), (3, 1))
    assert_size_stride(arg5_1, (13, 3), (3, 1))
    assert_size_stride(arg6_1, (13, 3), (3, 1))
    assert_size_stride(arg7_1, (13, 3), (3, 1))
    assert_size_stride(arg8_1, (13, 3), (3, 1))
    assert_size_stride(arg9_1, (13, 3), (3, 1))
    assert_size_stride(arg10_1, (13, 3), (3, 1))
    assert_size_stride(arg11_1, (13, 3), (3, 1))
    return (reinterpret_tensor(arg0_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg1_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg2_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg3_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg4_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg5_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg6_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg7_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg8_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg9_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg10_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg11_1, (11, 3), (3, 1), 3), )
def benchmark_compiled_module(times=10, repeat=10):
    from torch._dynamo.testing import rand_strided
    from torch._inductor.utils import print_performance
    arg0_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg1_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg2_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg3_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg4_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg5_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg6_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg7_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg8_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg9_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg10_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg11_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1])
    return print_performance(fn, times=times, repeat=repeat)
if __name__ == "__main__":
    from torch._inductor.wrapper_benchmark import compiled_module_main
    compiled_module_main('hf_BigBird', benchmark_compiled_module)
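The generated call() contains no kernels at all: every output is a zero-copy reinterpretation of its input with storage offset 3, i.e. it skips one (3,)-row of int32 and views the remaining 11 rows, which is exactly arg[1:12]. A sketch of the same view built with the public as_strided API (reinterpret_tensor itself is an Inductor-internal helper):

import torch

arg = torch.arange(39, dtype=torch.int32).reshape(13, 3)
view = arg.as_strided((11, 3), (3, 1), storage_offset=3)
assert torch.equal(view, arg[1:12])
assert view.data_ptr() == arg.data_ptr() + 3 * arg.element_size()  # no copy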
V0627 17:31:04.258000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:04.259000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0, "has_payload": "bb9e1aaf4decc7f300fbb51ff6f34967"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202274384)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| +- GuardManager: source=L['___stack0'], accessed_by=DictGetItemGuardAccessor(___stack0)
| | +- TYPE_MATCH: ___check_type_id(L['___stack0'], 7650400)
| | +- LENGTH_CHECK: len(L['___stack0']) == 12
| | +- GuardManager: source=L['___stack0'][0], accessed_by=ListGetItemGuardAccessor(0)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][0]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][0]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][0]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][1], accessed_by=ListGetItemGuardAccessor(1)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][1]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][1]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][1]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][2], accessed_by=ListGetItemGuardAccessor(2)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][2]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][2]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][2]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][3], accessed_by=ListGetItemGuardAccessor(3)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][3]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][3]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][3]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][4], accessed_by=ListGetItemGuardAccessor(4)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][4]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][4]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][4]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][5], accessed_by=ListGetItemGuardAccessor(5)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][5]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][5]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][5]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][6], accessed_by=ListGetItemGuardAccessor(6)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][6]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][6]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][6]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][7], accessed_by=ListGetItemGuardAccessor(7)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][7]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][7]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][7]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][8], accessed_by=ListGetItemGuardAccessor(8)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][8]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][8]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][8]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][9], accessed_by=ListGetItemGuardAccessor(9)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][9]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][9]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][9]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][10], accessed_by=ListGetItemGuardAccessor(10)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][10]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][10]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][10]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][11], accessed_by=ListGetItemGuardAccessor(11)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][11]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][11]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][11]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| +- GuardManager: source=L['num_heads'], accessed_by=DictGetItemGuardAccessor(num_heads)
| | +- EQUALS_MATCH: L['num_heads'] == 12
| +- GuardManager: source=L['num_blocks'], accessed_by=DictGetItemGuardAccessor(num_blocks)
| | +- EQUALS_MATCH: L['num_blocks'] == 13
| +- GuardManager: source=L['global_block_top'], accessed_by=DictGetItemGuardAccessor(global_block_top)
| | +- EQUALS_MATCH: L['global_block_top'] == 1
| +- GuardManager: source=L['global_block_bottom'], accessed_by=DictGetItemGuardAccessor(global_block_bottom)
| | +- EQUALS_MATCH: L['global_block_bottom'] == 1
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['__builtins_dict___44'], accessed_by=DictGetItemGuardAccessor(__builtins_dict___44)
| | | +- GuardManager: source=G['__builtins_dict___44']['range'], accessed_by=DictGetItemGuardAccessor(range)
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___44']['range'], 7632448)
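Before reusing this compiled artifact, Dynamo evaluates the guard tree above against the live frame. A rough Python rendering of what it checks, as an illustration only (the real guards run in C++, and NO_TENSOR_ALIASING does a proper storage-overlap check rather than the simplified data_ptr comparison used here):

import torch

def guards_ok(stack0, num_heads, num_blocks, global_block_top, global_block_bottom):
    # TYPE_MATCH / LENGTH_CHECK on L['___stack0']
    if not (isinstance(stack0, list) and len(stack0) == 12):
        return False
    tensors = [torch.from_numpy(a) for a in stack0]  # ___from_numpy(...)
    for t in tensors:
        # TENSOR_MATCH: dtype, shape, stride, requires_grad
        if (t.dtype != torch.int32 or tuple(t.shape) != (13, 3)
                or t.stride() != (3, 1) or t.requires_grad):
            return False
    # NO_TENSOR_ALIASING (simplified): all twelve arrays are distinct buffers
    if len({t.data_ptr() for t in tensors}) != len(tensors):
        return False
    # EQUALS_MATCH on the scalar locals
    return (num_heads, num_blocks, global_block_top, global_block_bottom) == (12, 13, 1, 1)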
V0627 17:31:04.259000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "14/0", "frame_key": "19", "co_name": "torch_dynamo_resume_in__bigbird_block_rand_mask_with_head_at_1165", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 1165, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 25, "shape_env_guard_count": 0, "graph_op_count": 12, "graph_node_count": 25, "graph_input_count": 12, "start_time": 1719534664.1278515, "entire_frame_compile_time_s": 0.13145732879638672, "backend_compile_time_s": 0.09916210174560547, "inductor_compile_time_s": 0.022524356842041016, "code_gen_time_s": 0.003596067428588867, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": [], "dynamo_time_before_restart_s": 0.0, "has_guarded_code": true}, "frame_id": 14, "frame_compile_id": 0, "attempt": 0}
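For this compilation_metrics record the timing fields nest as one would expect: codegen time is part of the Inductor compile time, which is part of the backend compile time, which is part of the whole-frame compile time. A sanity-check sketch over the values logged above:

metrics = {"entire_frame_compile_time_s": 0.13145732879638672,
           "backend_compile_time_s": 0.09916210174560547,
           "inductor_compile_time_s": 0.022524356842041016,
           "code_gen_time_s": 0.003596067428588867}
assert metrics["code_gen_time_s"] <= metrics["inductor_compile_time_s"]
assert metrics["inductor_compile_time_s"] <= metrics["backend_compile_time_s"]
assert metrics["backend_compile_time_s"] <= metrics["entire_frame_compile_time_s"]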
V0627 17:31:04.260000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 472, "name": "forward", "filename": 5}, {"line": 569, "name": "bigbird_block_sparse_attention", "filename": 5}, {"line": 583, "name": "torch_dynamo_resume_in_bigbird_block_sparse_attention_at_569", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.267000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.267000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e310b7060>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.267000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 0, "source": "___from_numpy(L['___stack0'][0])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.268000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.268000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311ad1c0>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.268000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 1, "source": "___from_numpy(L['___stack0'][1])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.269000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.269000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 2, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e3119c810>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.269000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 2, "source": "___from_numpy(L['___stack0'][2])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.269000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 3, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.270000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 3, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e3119ef70>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.270000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 3, "source": "___from_numpy(L['___stack0'][3])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.270000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 4, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.270000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 4, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 4, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311c58a0>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.270000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 4, "source": "___from_numpy(L['___stack0'][4])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.271000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 5, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.271000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 5, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 5, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30db1d00>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.271000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 5, "source": "___from_numpy(L['___stack0'][5])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.272000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 6, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.272000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 6, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 6, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e310b9f30>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.272000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 6, "source": "___from_numpy(L['___stack0'][6])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.273000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 7, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.273000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 7, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 7, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311ac310>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.273000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 7, "source": "___from_numpy(L['___stack0'][7])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.274000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 8, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.274000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 8, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 8, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e310bb100>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.274000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 8, "source": "___from_numpy(L['___stack0'][8])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.274000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 9, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.275000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 9, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 9, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30d4d580>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.275000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 9, "source": "___from_numpy(L['___stack0'][9])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.275000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 10, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.275000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 10, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 10, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30d4c450>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.275000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 10, "source": "___from_numpy(L['___stack0'][10])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.276000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 11, "describer_id": 38, "size": 132}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.276000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 11, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 11, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e310f5e90>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.276000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 11, "source": "___from_numpy(L['___stack0'][11])"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.279000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 12, "describer_id": 38, "size": 1277952}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.279000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 13, "ndim": 4, "dtype": "torch.bfloat16", "device": "device(type='cpu')", "size": [1, 12, 832, 64], "is_leaf": true, "stride": [638976, 64, 768, 1], "storage": 12, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311c6840>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.279000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 13, "source": "L['query_layer']"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.283000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 13, "describer_id": 38, "size": 3328}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.283000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 17, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 13, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.284000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 16, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 13, 64], "is_leaf": true, "is_view": true, "stride": [832, 64, 1], "storage": 13, "base": 17, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317fbec0>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.284000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 16, "source": "L['from_blocked_mask']"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.293000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 14, "describer_id": 38, "size": 1277952}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.294000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 31, "ndim": 4, "dtype": "torch.bfloat16", "device": "device(type='cpu')", "size": [1, 12, 832, 64], "is_leaf": true, "stride": [638976, 64, 768, 1], "storage": 14, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311c73d0>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.294000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 31, "source": "L['key_layer']"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.295000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 15, "describer_id": 38, "size": 1277952}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.295000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 32, "ndim": 4, "dtype": "torch.bfloat16", "device": "device(type='cpu')", "size": [1, 12, 832, 64], "is_leaf": true, "stride": [638976, 64, 768, 1], "storage": 15, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311c7470>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.295000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 32, "source": "L['value_layer']"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.316000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 50, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 13, "base": 17, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.317000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 50, "source": "L['to_mask']"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.378000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 16, "describer_id": 38, "size": 442368}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.378000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 124, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 16, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.378000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 124, "source": "L['band_mask']"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.439000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 182, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 13, "base": 17, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 38}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:04.439000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 38, "id": 182, "source": "L['from_mask']"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
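Frame 15 resumes after the numpy block: its inputs are the twelve trimmed (11, 3) arrays from frame 14 plus the bfloat16 query/key/value layers and the float32 masks. The storage byte counts in the describe_storage records above are consistent with the recorded shapes and dtypes, e.g.:

# Sanity-check sketch for the describe_storage sizes above
assert 11 * 3 * 4 == 132                    # (11, 3) int32 rand_attn entries
assert 1 * 12 * 832 * 64 * 2 == 1277952     # (1, 12, 832, 64) bfloat16 q/k/v
assert 1 * 1 * 9 * 64 * 192 * 4 == 442368   # (1, 1, 9, 64, 192) float32 band_mask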
V0627 17:31:04.456000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {"l_stack0_0_": [11, 3], "l_stack0_1_": [11, 3], "l_stack0_2_": [11, 3], "l_stack0_3_": [11, 3], "l_stack0_4_": [11, 3], "l_stack0_5_": [11, 3], "l_stack0_6_": [11, 3], "l_stack0_7_": [11, 3], "l_stack0_8_": [11, 3], "l_stack0_9_": [11, 3], "l_stack0_10_": [11, 3], "l_stack0_11_": [11, 3], "l_query_layer_": [1, 12, 832, 64], "l_from_blocked_mask_": [1, 13, 64], "l_key_layer_": [1, 12, 832, 64], "l_value_layer_": [1, 12, 832, 64], "l_to_mask_": [1, 1, 1, 832], "l_band_mask_": [1, 1, 9, 64, 192], "l_from_mask_": [1, 1, 832, 1], "rand_attn": [12, 11, 3], "rand_attn_1": [1, 12, 11, 3], "unsqueeze_": [1, 12, 11, 3], "rand_attn_2": [1, 12, 11, 3], "p1": [13, 64], "i1": [12, 11, 3], "flatten": [396], "getitem_2": [396, 64], "rand_mask": [1, 396, 64], "rand_mask_1": [1, 12, 11, 192], "getitem_3": [1, 11, 64], "rand_mask_2": [1, 12, 11, 64, 192], "blocked_query_matrix": [1, 12, 13, 64, 64], "blocked_key_matrix": [1, 12, 13, 64, 64], "blocked_value_matrix": [1, 12, 13, 64, 64], "shift": [396], "div": [396], "indices_shift": [396], "view_4": [396], "flattened_indices": [396], "flattened_params": [156, 64, 64], "out_flattened": [396, 64, 64], "out": [1, 12, 33, 64, 64], "gathered_key": [1, 12, 11, 192, 64], "shift_1": [396], "div_1": [396], "indices_shift_1": [396], "view_6": [396], "flattened_indices_1": [396], "flattened_params_1": [156, 64, 64], "out_flattened_1": [396, 64, 64], "out_1": [1, 12, 33, 64, 64], "gathered_value": [1, 12, 11, 192, 64], "getitem_4": [1, 12, 64, 64], "reshape_4": [12, 64, 64], "reshape_5": [12, 832, 64], "transpose": [12, 64, 832], "bmm": [12, 64, 832], "first_product": [1, 12, 64, 832], "first_product_1": [1, 12, 64, 832], "sub": [1, 1, 1, 832], "mul_3": [1, 1, 1, 832], "first_product_2": [1, 12, 64, 832], "first_attn_weights": [1, 12, 64, 832], "reshape_6": [12, 64, 832], "reshape_7": [12, 832, 64], "bmm_1": [12, 64, 64], "first_context_layer": [1, 12, 1, 64, 64], "unsqueeze__1": [1, 12, 1, 64, 64], "getitem_5": [1, 12, 64, 64], "getitem_6": [1, 12, 64, 64], "getitem_7": [1, 12, 64, 64], "getitem_8": [1, 12, 64, 64], "getitem_9": [1, 12, 192, 64], "second_key_mat": [1, 12, 448, 64], "getitem_10": [1, 12, 64, 64], "getitem_11": [1, 12, 64, 64], "getitem_12": [1, 12, 64, 64], "getitem_13": [1, 12, 64, 64], "getitem_14": [1, 12, 192, 64], "second_value_mat": [1, 12, 448, 64], "getitem_15": [1, 12, 64, 64], "reshape_8": [12, 64, 64], "reshape_9": [12, 448, 64], "transpose_1": [12, 64, 448], "bmm_2": [12, 64, 448], "second_product": [1, 12, 64, 448], "getitem_16": [1, 1, 1, 192], "getitem_17": [1, 1, 1, 64], "new_ones": [1, 1, 1, 192], "second_seq_pad": [1, 1, 1, 448], "new_ones_1": [1, 12, 64, 256], "getitem_18": [1, 12, 64, 192], "second_rand_pad": [1, 12, 64, 448], "second_product_1": [1, 12, 64, 448], "minimum": [1, 12, 64, 448], "sub_1": [1, 12, 64, 448], "mul_5": [1, 12, 64, 448], "second_product_2": [1, 12, 64, 448], "second_attn_weights": [1, 12, 64, 448], "reshape_10": [12, 64, 448], "reshape_11": [12, 448, 64], "bmm_3": [12, 64, 64], "second_context_layer": [1, 12, 1, 64, 64], "unsqueeze__2": [1, 12, 1, 64, 64], "getitem_19": [1, 12, 9, 64, 64], "getitem_20": [1, 12, 9, 64, 64], "getitem_21": [1, 12, 9, 64, 64], "exp_blocked_key_matrix": [1, 12, 9, 192, 64], "getitem_22": [1, 12, 9, 64, 64], "getitem_23": [1, 12, 9, 64, 64], "getitem_24": [1, 12, 9, 64, 64], "exp_blocked_value_matrix": [1, 12, 9, 192, 64], "middle_query_matrix": [1, 12, 9, 64, 64], "reshape_12": [108, 64, 64], "reshape_13": [108, 192, 64], "transpose_2": [108, 64, 192], "bmm_4": [108, 64, 192], "inner_band_product": [1, 12, 9, 64, 192], "inner_band_product_1": [1, 12, 9, 64, 192], "getitem_26": [1, 12, 9, 192, 64], "reshape_14": [108, 64, 64], "reshape_15": [108, 192, 64], "transpose_3": [108, 64, 192], "bmm_5": [108, 64, 192], "rand_band_product": [1, 12, 9, 64, 192], "rand_band_product_1": [1, 12, 9, 64, 192], "getitem_27": [1, 12, 64, 64], "first_band_product": [1, 12, 9, 64, 64], "first_band_product_1": [1, 12, 9, 64, 64], "getitem_28": [1, 12, 64, 64], "last_band_product": [1, 12, 9, 64, 64], "last_band_product_1": [1, 12, 9, 64, 64], "sub_2": [1, 1, 9, 64, 192], "mul_10": [1, 1, 9, 64, 192], "inner_band_product_2": [1, 12, 9, 64, 192], "getitem_29": [1, 1, 1, 64], "unsqueeze": [1, 1, 1, 1, 64], "sub_3": [1, 1, 1, 1, 64], "mul_11": [1, 1, 1, 1, 64], "first_band_product_2": [1, 12, 9, 64, 64], "getitem_30": [1, 1, 1, 64], "unsqueeze_1": [1, 1, 1, 1, 64], "sub_4": [1, 1, 1, 1, 64], "mul_12": [1, 1, 1, 1, 64], "last_band_product_2": [1, 12, 9, 64, 64], "getitem_31": [1, 12, 9, 64, 192], "sub_5": [1, 12, 9, 64, 192], "mul_13": [1, 12, 9, 64, 192], "rand_band_product_2": [1, 12, 9, 64, 192], "band_product": [1, 12, 9, 64, 512], "attn_weights": [1, 12, 9, 64, 512], "getitem_32": [1, 12, 9, 64, 192], "reshape_16": [108, 64, 192], "reshape_17": [108, 192, 64], "bmm_6": [108, 64, 64], "context_layer": [1, 12, 9, 64, 64], "getitem_33": [1, 12, 9, 64, 192], "getitem_34": [1, 12, 9, 192, 64], "reshape_18": [108, 64, 192], "reshape_19": [108, 192, 64], "bmm_7": [108, 64, 64], "view_15": [1, 12, 9, 64, 64], "context_layer_1": [1, 12, 9, 64, 64], "getitem_35": [1, 12, 9, 64, 64], "getitem_36": [1, 12, 64, 64], "einsum_3": [1, 12, 9, 64, 64], "context_layer_2": [1, 12, 9, 64, 64], "getitem_37": [1, 12, 9, 64, 64], "getitem_38": [1, 12, 64, 64], "einsum_4": [1, 12, 9, 64, 64], "context_layer_3": [1, 12, 9, 64, 64], "getitem_39": [1, 12, 64, 64], "getitem_40": [1, 12, 64, 64], "getitem_41": [1, 12, 64, 64], "getitem_42": [1, 12, 64, 64], "getitem_43": [1, 12, 192, 64], "second_last_key_mat": [1, 12, 448, 64], "getitem_44": [1, 12, 64, 64], "getitem_45": [1, 12, 64, 64], "getitem_46": [1, 12, 64, 64], "getitem_47": [1, 12, 64, 64], "getitem_48": [1, 12, 192, 64], "second_last_value_mat": [1, 12, 448, 64], "getitem_49": [1, 12, 64, 64], "reshape_20": [12, 64, 64], "reshape_21": [12, 448, 64], "transpose_4": [12, 64, 448], "bmm_8": [12, 64, 448], "second_last_product": [1, 12, 64, 448], "getitem_50": [1, 1, 1, 64], "getitem_51": [1, 1, 1, 192], "new_ones_2": [1, 1, 1, 192], "second_last_seq_pad": [1, 1, 1, 448], "new_ones_3": [1, 12, 64, 256], "getitem_52": [1, 12, 64, 192], "second_last_rand_pad": [1, 12, 64, 448], "second_last_product_1": [1, 12, 64, 448], "minimum_1": [1, 12, 64, 448], "sub_6": [1, 12, 64, 448], "mul_15": [1, 12, 64, 448], "second_last_product_2": [1, 12, 64, 448], "second_last_attn_weights": [1, 12, 64, 448], "reshape_22": [12, 64, 448], "reshape_23": [12, 448, 64], "bmm_9": [12, 64, 64], "second_last_context_layer": [1, 12, 1, 64, 64], "unsqueeze__3": [1, 12, 1, 64, 64], "getitem_53": [1, 12, 64, 64], "reshape_24": [12, 64, 64], "reshape_25": [12, 832, 64], "transpose_5": [12, 64, 832], "bmm_10": [12, 64, 832], "last_product": [1, 12, 64, 832], "last_product_1": [1, 12, 64, 832], "sub_7": [1, 1, 1, 832], "mul_17": [1, 1, 1, 832], "last_product_2": [1, 12, 64, 832], "last_attn_weights": [1, 12, 64, 832], "reshape_26": [12, 64, 832], "reshape_27": [12, 832, 64], "bmm_11": [12, 64, 64], "last_context_layer": [1, 12, 1, 64, 64], "unsqueeze__4": [1, 12, 1, 64, 64], "context_layer_4": [1, 12, 13, 64, 64], "view_20": [1, 12, 832, 64], "context_layer_5": [1, 12, 832, 64], "context_layer_6": [1, 832, 12, 64]}}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0, "has_payload": "8c75f78cbe780240c3647a51d604b94a"}
class GraphModule(torch.nn.Module):
def forward(self, L_stack0_0_: "i32[11, 3][3, 1]cpu", L_stack0_1_: "i32[11, 3][3, 1]cpu", L_stack0_2_: "i32[11, 3][3, 1]cpu", L_stack0_3_: "i32[11, 3][3, 1]cpu", L_stack0_4_: "i32[11, 3][3, 1]cpu", L_stack0_5_: "i32[11, 3][3, 1]cpu", L_stack0_6_: "i32[11, 3][3, 1]cpu", L_stack0_7_: "i32[11, 3][3, 1]cpu", L_stack0_8_: "i32[11, 3][3, 1]cpu", L_stack0_9_: "i32[11, 3][3, 1]cpu", L_stack0_10_: "i32[11, 3][3, 1]cpu", L_stack0_11_: "i32[11, 3][3, 1]cpu", L_query_layer_: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", L_from_blocked_mask_: "f32[1, 13, 64][832, 64, 1]cpu", L_key_layer_: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", L_value_layer_: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", L_to_mask_: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu", L_band_mask_: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu", L_from_mask_: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu"):
l_stack0_0_ = L_stack0_0_
l_stack0_1_ = L_stack0_1_
l_stack0_2_ = L_stack0_2_
l_stack0_3_ = L_stack0_3_
l_stack0_4_ = L_stack0_4_
l_stack0_5_ = L_stack0_5_
l_stack0_6_ = L_stack0_6_
l_stack0_7_ = L_stack0_7_
l_stack0_8_ = L_stack0_8_
l_stack0_9_ = L_stack0_9_
l_stack0_10_ = L_stack0_10_
l_stack0_11_ = L_stack0_11_
l_query_layer_ = L_query_layer_
l_from_blocked_mask_ = L_from_blocked_mask_
l_key_layer_ = L_key_layer_
l_value_layer_ = L_value_layer_
l_to_mask_ = L_to_mask_
l_band_mask_ = L_band_mask_
l_from_mask_ = L_from_mask_
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:593 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = np.stack(rand_attn, axis=0)
rand_attn: "i32[12, 11, 3][33, 3, 1]cpu" = torch__dynamo_utils_wrapped_stack([l_stack0_0_, l_stack0_1_, l_stack0_2_, l_stack0_3_, l_stack0_4_, l_stack0_5_, l_stack0_6_, l_stack0_7_, l_stack0_8_, l_stack0_9_, l_stack0_10_, l_stack0_11_], axis = 0); l_stack0_0_ = l_stack0_1_ = l_stack0_2_ = l_stack0_3_ = l_stack0_4_ = l_stack0_5_ = l_stack0_6_ = l_stack0_7_ = l_stack0_8_ = l_stack0_9_ = l_stack0_10_ = l_stack0_11_ = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:594 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = torch.tensor(rand_attn, device=query_layer.device, dtype=torch.long)
rand_attn_1: "i64[1, 12, 11, 3][396, 33, 3, 1]cpu" = torch.tensor(rand_attn, device = device(type='cpu'), dtype = torch.int64); rand_attn = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:595 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn.unsqueeze_(0)
unsqueeze_: "i64[1, 12, 11, 3][396, 33, 3, 1]cpu" = rand_attn_1.unsqueeze_(0)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:596 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = torch.cat([rand_attn for _ in range(batch_size)], dim=0)
rand_attn_2: "i64[1, 12, 11, 3][396, 33, 3, 1]cpu" = torch.cat([rand_attn_1], dim = 0); rand_attn_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in _create_rand_mask_from_inputs, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
p1: "f32[13, 64][64, 1]cpu" = l_from_blocked_mask_[0]
i1: "i64[12, 11, 3][33, 3, 1]cpu" = rand_attn_2[0]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in <listcomp>, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
flatten: "i64[396][1]cpu" = i1.flatten(); i1 = None
getitem_2: "f32[396, 64][64, 1]cpu" = p1[flatten]; p1 = flatten = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in _create_rand_mask_from_inputs, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
rand_mask: "f32[1, 396, 64][25344, 64, 1]cpu" = torch.stack([getitem_2]); getitem_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1016 in _create_rand_mask_from_inputs, code: rand_mask = rand_mask.view(batch_size, num_attention_heads, num_windows, num_rand_blocks * from_block_size)
rand_mask_1: "f32[1, 12, 11, 192][25344, 2112, 192, 1]cpu" = rand_mask.view(1, 12, 11, 192); rand_mask = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1017 in _create_rand_mask_from_inputs, code: rand_mask = torch.einsum("blq,bhlk->bhlqk", from_blocked_mask[:, 1:-1], rand_mask)
getitem_3: "f32[1, 11, 64][832, 64, 1]cpu" = l_from_blocked_mask_[(slice(None, None, None), slice(1, -1, None))]; l_from_blocked_mask_ = None
rand_mask_2: "f32[1, 12, 11, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.functional.einsum('blq,bhlk->bhlqk', getitem_3, rand_mask_1); getitem_3 = rand_mask_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:602 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_query_matrix = query_layer.view(bsz, n_heads, from_seq_len // from_block_size, from_block_size, -1)
blocked_query_matrix: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = l_query_layer_.view(1, 12, 13, 64, -1); l_query_layer_ = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:603 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix = key_layer.view(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1)
blocked_key_matrix: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = l_key_layer_.view(1, 12, 13, 64, -1)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:604 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix = value_layer.view(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1)
blocked_value_matrix: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = l_value_layer_.view(1, 12, 13, 64, -1)
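# query/key/value are reshaped into 13 blocks of 64 positions each
# (13 * 64 = 832); these views share storage with the original layers.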
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:972 in torch_gather_b2, code: shift = torch.arange(indices.shape[0] * indices.shape[1] * num_indices_to_gather, device=indices.device)
shift: "i64[396][1]cpu" = torch.arange(396, device = device(type='cpu'))
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:973 in torch_gather_b2, code: indices_shift = torch.div(shift, num_indices_to_gather, rounding_mode="floor") * num_indices_to_pick_from
div: "i64[396][1]cpu" = torch.div(shift, 33, rounding_mode = 'floor'); shift = None
indices_shift: "i64[396][1]cpu" = div * 13; div = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:975 in torch_gather_b2, code: flattened_indices = indices.view(-1) + indices_shift
view_4: "i64[396][1]cpu" = rand_attn_2.view(-1)
flattened_indices: "i64[396][1]cpu" = view_4 + indices_shift; view_4 = indices_shift = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:976 in torch_gather_b2, code: flattened_params = params.reshape(-1, params.shape[-2], params.shape[-1])
flattened_params: "bf16[156, 64, 64][4096, 64, 1]cpu" = blocked_key_matrix.reshape(-1, 64, 64)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:978 in torch_gather_b2, code: out_flattened = flattened_params.index_select(0, flattened_indices)
out_flattened: "bf16[396, 64, 64][4096, 64, 1]cpu" = flattened_params.index_select(0, flattened_indices); flattened_params = flattened_indices = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:980 in torch_gather_b2, code: out = out_flattened.reshape(params.shape[:2] + (num_indices_to_gather,) + params.shape[3:])
out: "bf16[1, 12, 33, 64, 64][1622016, 135168, 4096, 64, 1]cpu" = out_flattened.reshape((1, 12, 33, 64, 64)); out_flattened = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:608 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key = gathered_key.view(
gathered_key: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = out.view(1, 12, 11, 192, -1); out = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:972 in torch_gather_b2, code: shift = torch.arange(indices.shape[0] * indices.shape[1] * num_indices_to_gather, device=indices.device)
shift_1: "i64[396][1]cpu" = torch.arange(396, device = device(type='cpu'))
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:973 in torch_gather_b2, code: indices_shift = torch.div(shift, num_indices_to_gather, rounding_mode="floor") * num_indices_to_pick_from
div_1: "i64[396][1]cpu" = torch.div(shift_1, 33, rounding_mode = 'floor'); shift_1 = None
indices_shift_1: "i64[396][1]cpu" = div_1 * 13; div_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:975 in torch_gather_b2, code: flattened_indices = indices.view(-1) + indices_shift
view_6: "i64[396][1]cpu" = rand_attn_2.view(-1)
flattened_indices_1: "i64[396][1]cpu" = view_6 + indices_shift_1; view_6 = indices_shift_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:976 in torch_gather_b2, code: flattened_params = params.reshape(-1, params.shape[-2], params.shape[-1])
flattened_params_1: "bf16[156, 64, 64][4096, 64, 1]cpu" = blocked_value_matrix.reshape(-1, 64, 64)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:978 in torch_gather_b2, code: out_flattened = flattened_params.index_select(0, flattened_indices)
out_flattened_1: "bf16[396, 64, 64][4096, 64, 1]cpu" = flattened_params_1.index_select(0, flattened_indices_1); flattened_params_1 = flattened_indices_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:980 in torch_gather_b2, code: out = out_flattened.reshape(params.shape[:2] + (num_indices_to_gather,) + params.shape[3:])
out_1: "bf16[1, 12, 33, 64, 64][1622016, 135168, 4096, 64, 1]cpu" = out_flattened_1.reshape((1, 12, 33, 64, 64)); out_flattened_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:612 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value = gathered_value.view(
gathered_value: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = out_1.view(1, 12, 11, 192, -1); out_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:621 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, 0], key_layer, ndim=4)
getitem_4: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_query_matrix[(slice(None, None, None), slice(None, None, None), 0)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
reshape_4: "bf16[12, 64, 64][64, 768, 1]cpu" = getitem_4.reshape((-1, 64, 64)); getitem_4 = None
reshape_5: "bf16[12, 832, 64][64, 768, 1]cpu" = l_key_layer_.reshape((-1, 832, 64))
transpose: "bf16[12, 64, 832][64, 1, 768]cpu" = reshape_5.transpose(1, 2); reshape_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
bmm: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.bmm(reshape_4, transpose); reshape_4 = transpose = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
first_product: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = bmm.view((1, 12, 64, 832)); bmm = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:623 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product = first_product * rsqrt_d
first_product_1: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = first_product * 0.125; first_product = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:624 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product += (1.0 - to_mask) * attn_mask_penalty
sub: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = 1.0 - l_to_mask_
mul_3: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = sub * -10000.0; sub = None
first_product_1 += mul_3; first_product_2: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = first_product_1; first_product_1 = mul_3 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:625 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_attn_weights = nn.functional.softmax(
first_attn_weights: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.nn.functional.softmax(first_product_2, dim = -1); first_product_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
reshape_6: "bf16[12, 64, 832][53248, 832, 1]cpu" = first_attn_weights.reshape((-1, 64, 832)); first_attn_weights = None
reshape_7: "bf16[12, 832, 64][64, 768, 1]cpu" = l_value_layer_.reshape((-1, 832, 64))
bmm_1: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.bmm(reshape_6, reshape_7); reshape_6 = reshape_7 = None
first_context_layer: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = bmm_1.view((1, 12, 64, 64)); bmm_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:631 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_context_layer.unsqueeze_(2)
unsqueeze__1: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = first_context_layer.unsqueeze_(2)
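# First query block (global attention): it attends to all 832 keys, scaled by
# rsqrt_d = 0.125 = 1/sqrt(head_dim=64); masked-out keys receive the -10000.0
# attn_mask_penalty before softmax. unsqueeze_(2) restores the block dim,
# giving [1, 12, 1, 64, 64].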
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:641 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 0],
getitem_5: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), 0)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:642 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 1],
getitem_6: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), 1)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:643 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 2],
getitem_7: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), 2)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:644 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -1],
getitem_8: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), -1)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:645 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key[:, :, 0],
getitem_9: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = gathered_key[(slice(None, None, None), slice(None, None, None), 0)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:639 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_key_mat = torch.cat(
second_key_mat: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" = torch.cat([getitem_5, getitem_6, getitem_7, getitem_8, getitem_9], dim = 2); getitem_5 = getitem_6 = getitem_7 = getitem_8 = getitem_9 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:651 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 0],
getitem_10: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), 0)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:652 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 1],
getitem_11: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), 1)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:653 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 2],
getitem_12: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), 2)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:654 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -1],
getitem_13: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), -1)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:655 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value[:, :, 0],
getitem_14: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = gathered_value[(slice(None, None, None), slice(None, None, None), 0)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:649 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_value_mat = torch.cat(
second_value_mat: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" = torch.cat([getitem_10, getitem_11, getitem_12, getitem_13, getitem_14], dim = 2); getitem_10 = getitem_11 = getitem_12 = getitem_13 = getitem_14 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:661 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, 1], second_key_mat, ndim=4)
getitem_15: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_query_matrix[(slice(None, None, None), slice(None, None, None), 1)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
reshape_8: "bf16[12, 64, 64][64, 768, 1]cpu" = getitem_15.reshape((-1, 64, 64)); getitem_15 = None
reshape_9: "bf16[12, 448, 64][28672, 64, 1]cpu" = second_key_mat.reshape((-1, 448, 64)); second_key_mat = None
transpose_1: "bf16[12, 64, 448][28672, 1, 64]cpu" = reshape_9.transpose(1, 2); reshape_9 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
bmm_2: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.bmm(reshape_8, transpose_1); reshape_8 = transpose_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
second_product: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = bmm_2.view((1, 12, 64, 448)); bmm_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:664 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, : 3 * to_block_size],
getitem_16: "f32[1, 1, 1, 192][832, 832, 832, 1]cpu" = l_to_mask_[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(None, 192, None))]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:665 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, -to_block_size:],
getitem_17: "f32[1, 1, 1, 64][832, 832, 832, 1]cpu" = l_to_mask_[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(-64, None, None))]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:666 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask.new_ones([bsz, 1, 1, n_rand_blocks * to_block_size]),
new_ones: "f32[1, 1, 1, 192][192, 192, 192, 1]cpu" = l_to_mask_.new_ones([1, 1, 1, 192])
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:662 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_seq_pad = torch.cat(
second_seq_pad: "f32[1, 1, 1, 448][448, 448, 448, 1]cpu" = torch.cat([getitem_16, getitem_17, new_ones], dim = 3); getitem_16 = getitem_17 = new_ones = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:672 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask.new_ones([bsz, n_heads, from_block_size, 4 * to_block_size]),
new_ones_1: "f32[1, 12, 64, 256][196608, 16384, 256, 1]cpu" = rand_mask_2.new_ones([1, 12, 64, 256])
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:673 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask[:, :, 0],
getitem_18: "f32[1, 12, 64, 192][1622016, 135168, 192, 1]cpu" = rand_mask_2[(slice(None, None, None), slice(None, None, None), 0)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:670 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_rand_pad = torch.cat(
second_rand_pad: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.cat([new_ones_1, getitem_18], dim = 3); new_ones_1 = getitem_18 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:677 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product = second_product * rsqrt_d
second_product_1: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = second_product * 0.125; second_product = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:678 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product += (1.0 - torch.minimum(second_seq_pad, second_rand_pad)) * attn_mask_penalty
minimum: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.minimum(second_seq_pad, second_rand_pad); second_seq_pad = second_rand_pad = None
sub_1: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = 1.0 - minimum; minimum = None
mul_5: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = sub_1 * -10000.0; sub_1 = None
second_product_1 += mul_5; second_product_2: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = second_product_1; second_product_1 = mul_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:679 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_attn_weights = nn.functional.softmax(
second_attn_weights: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.nn.functional.softmax(second_product_2, dim = -1); second_product_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
reshape_10: "bf16[12, 64, 448][28672, 448, 1]cpu" = second_attn_weights.reshape((-1, 64, 448)); second_attn_weights = None
reshape_11: "bf16[12, 448, 64][28672, 64, 1]cpu" = second_value_mat.reshape((-1, 448, 64)); second_value_mat = None
bmm_3: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.bmm(reshape_10, reshape_11); reshape_10 = reshape_11 = None
second_context_layer: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = bmm_3.view((1, 12, 64, 64)); bmm_3 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:686 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_context_layer.unsqueeze_(2)
unsqueeze__2: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = second_context_layer.unsqueeze_(2)
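# Second query block: its 448 = 7 * 64 keys are key blocks 0, 1, 2, the last
# block, and the 3 gathered random blocks for row 0; torch.minimum combines the
# sequence pad (from to_mask) with the random-block pad before the penalty.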
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:696 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: [blocked_key_matrix[:, :, 1:-3], blocked_key_matrix[:, :, 2:-2], blocked_key_matrix[:, :, 3:-1]], dim=3
getitem_19: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), slice(1, -3, None))]
getitem_20: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), slice(2, -2, None))]
getitem_21: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), slice(3, -1, None))]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:695 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: exp_blocked_key_matrix = torch.cat(
exp_blocked_key_matrix: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" = torch.cat([getitem_19, getitem_20, getitem_21], dim = 3); getitem_19 = getitem_20 = getitem_21 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:699 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: [blocked_value_matrix[:, :, 1:-3], blocked_value_matrix[:, :, 2:-2], blocked_value_matrix[:, :, 3:-1]],
getitem_22: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), slice(1, -3, None))]
getitem_23: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), slice(2, -2, None))]
getitem_24: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), slice(3, -1, None))]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:698 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: exp_blocked_value_matrix = torch.cat(
exp_blocked_value_matrix: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" = torch.cat([getitem_22, getitem_23, getitem_24], dim = 3); getitem_22 = getitem_23 = getitem_24 = None
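# Sliding-window keys/values for the 9 innermost query blocks (2..10): each
# block sees its left, center, and right neighbor blocks (3 * 64 = 192 keys).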
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:702 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: middle_query_matrix = blocked_query_matrix[:, :, 2:-2]
middle_query_matrix: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = blocked_query_matrix[(slice(None, None, None), slice(None, None, None), slice(2, -2, None))]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
reshape_12: "bf16[108, 64, 64][4096, 64, 1]cpu" = middle_query_matrix.reshape((-1, 64, 64))
reshape_13: "bf16[108, 192, 64][12288, 64, 1]cpu" = exp_blocked_key_matrix.reshape((-1, 192, 64)); exp_blocked_key_matrix = None
transpose_2: "bf16[108, 64, 192][12288, 1, 64]cpu" = reshape_13.transpose(1, 2); reshape_13 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
bmm_4: "bf16[108, 64, 192][12288, 192, 1]cpu" = torch.bmm(reshape_12, transpose_2); reshape_12 = transpose_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
inner_band_product: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = bmm_4.view((1, 12, 9, 64, 192)); bmm_4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:708 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: inner_band_product = inner_band_product * rsqrt_d
inner_band_product_1: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = inner_band_product * 0.125; inner_band_product = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:712 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product = self.torch_bmm_nd_transpose(middle_query_matrix, gathered_key[:, :, 1:-1], ndim=5)
getitem_26: "bf16[1, 12, 9, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = gathered_key[(slice(None, None, None), slice(None, None, None), slice(1, -1, None))]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
reshape_14: "bf16[108, 64, 64][4096, 64, 1]cpu" = middle_query_matrix.reshape((-1, 64, 64))
reshape_15: "bf16[108, 192, 64][12288, 64, 1]cpu" = getitem_26.reshape((-1, 192, 64)); getitem_26 = None
transpose_3: "bf16[108, 64, 192][12288, 1, 64]cpu" = reshape_15.transpose(1, 2); reshape_15 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
bmm_5: "bf16[108, 64, 192][12288, 192, 1]cpu" = torch.bmm(reshape_14, transpose_3); reshape_14 = transpose_3 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
rand_band_product: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = bmm_5.view((1, 12, 9, 64, 192)); bmm_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:714 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product = rand_band_product * rsqrt_d
rand_band_product_1: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = rand_band_product * 0.125; rand_band_product = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:718 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqd,bhkd->bhlqk", middle_query_matrix, blocked_key_matrix[:, :, 0]
getitem_27: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), 0)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:717 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product = torch.einsum(
first_band_product: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.functional.einsum('bhlqd,bhkd->bhlqk', middle_query_matrix, getitem_27); getitem_27 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:720 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product = first_band_product * rsqrt_d
first_band_product_1: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = first_band_product * 0.125; first_band_product = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:724 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqd,bhkd->bhlqk", middle_query_matrix, blocked_key_matrix[:, :, -1]
getitem_28: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), -1)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:723 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product = torch.einsum(
last_band_product: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.functional.einsum('bhlqd,bhkd->bhlqk', middle_query_matrix, getitem_28); middle_query_matrix = getitem_28 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:726 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product = last_band_product * rsqrt_d
last_band_product_1: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = last_band_product * 0.125; last_band_product = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:729 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: inner_band_product += (1.0 - band_mask) * attn_mask_penalty
sub_2: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = 1.0 - l_band_mask_; l_band_mask_ = None
mul_10: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = sub_2 * -10000.0; sub_2 = None
inner_band_product_1 += mul_10; inner_band_product_2: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = inner_band_product_1; inner_band_product_1 = mul_10 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:730 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product += (1.0 - to_mask[:, :, :, :to_block_size].unsqueeze(3)) * attn_mask_penalty
getitem_29: "f32[1, 1, 1, 64][832, 832, 832, 1]cpu" = l_to_mask_[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(None, 64, None))]
unsqueeze: "f32[1, 1, 1, 1, 64][832, 832, 832, 64, 1]cpu" = getitem_29.unsqueeze(3); getitem_29 = None
sub_3: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = 1.0 - unsqueeze; unsqueeze = None
mul_11: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = sub_3 * -10000.0; sub_3 = None
first_band_product_1 += mul_11; first_band_product_2: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = first_band_product_1; first_band_product_1 = mul_11 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:731 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product += (1.0 - to_mask[:, :, :, -to_block_size:].unsqueeze(3)) * attn_mask_penalty
getitem_30: "f32[1, 1, 1, 64][832, 832, 832, 1]cpu" = l_to_mask_[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(-64, None, None))]
unsqueeze_1: "f32[1, 1, 1, 1, 64][832, 832, 832, 64, 1]cpu" = getitem_30.unsqueeze(3); getitem_30 = None
sub_4: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = 1.0 - unsqueeze_1; unsqueeze_1 = None
mul_12: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = sub_4 * -10000.0; sub_4 = None
last_band_product_1 += mul_12; last_band_product_2: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = last_band_product_1; last_band_product_1 = mul_12 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:732 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product += (1.0 - rand_mask[:, :, 1:-1]) * attn_mask_penalty
getitem_31: "f32[1, 12, 9, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = rand_mask_2[(slice(None, None, None), slice(None, None, None), slice(1, -1, None))]
sub_5: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = 1.0 - getitem_31; getitem_31 = None
mul_13: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = sub_5 * -10000.0; sub_5 = None
rand_band_product_1 += mul_13; rand_band_product_2: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = rand_band_product_1; rand_band_product_1 = mul_13 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:735 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: band_product = torch.cat(
band_product: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.cat([first_band_product_2, inner_band_product_2, rand_band_product_2, last_band_product_2], dim = -1); first_band_product_2 = inner_band_product_2 = rand_band_product_2 = last_band_product_2 = None
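# Per middle block, the 512 attention logits concatenate
# [first global | band | random | last global] = 64 + 192 + 192 + 64,
# each part already scaled by rsqrt_d and mask-penalized above.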
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:740 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights = nn.functional.softmax(
attn_weights: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.nn.functional.softmax(band_product, dim = -1); band_product = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:747 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights[:, :, :, :, to_block_size : 4 * to_block_size], exp_blocked_value_matrix, ndim=5
getitem_32: "bf16[1, 12, 9, 64, 192][3538944, 294912, 32768, 512, 1]cpu" = attn_weights[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(64, 256, None))]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
reshape_16: "bf16[108, 64, 192][32768, 512, 1]cpu" = getitem_32.reshape((-1, 64, 192)); getitem_32 = None
reshape_17: "bf16[108, 192, 64][12288, 64, 1]cpu" = exp_blocked_value_matrix.reshape((-1, 192, 64)); exp_blocked_value_matrix = None
bmm_6: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.bmm(reshape_16, reshape_17); reshape_16 = reshape_17 = None
context_layer: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = bmm_6.view((1, 12, 9, 64, 64)); bmm_6 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:754 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights[:, :, :, :, 4 * to_block_size : -to_block_size], gathered_value[:, :, 1:-1], ndim=5
getitem_33: "bf16[1, 12, 9, 64, 192][3538944, 294912, 32768, 512, 1]cpu" = attn_weights[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(256, -64, None))]
getitem_34: "bf16[1, 12, 9, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = gathered_value[(slice(None, None, None), slice(None, None, None), slice(1, -1, None))]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
reshape_18: "bf16[108, 64, 192][32768, 512, 1]cpu" = getitem_33.reshape((-1, 64, 192)); getitem_33 = None
reshape_19: "bf16[108, 192, 64][12288, 64, 1]cpu" = getitem_34.reshape((-1, 192, 64)); getitem_34 = None
bmm_7: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.bmm(reshape_18, reshape_19); reshape_18 = reshape_19 = None
view_15: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = bmm_7.view((1, 12, 9, 64, 64)); bmm_7 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:753 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += self.torch_bmm_nd(
context_layer += view_15; context_layer_1: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = context_layer; context_layer = view_15 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:760 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, :to_block_size], blocked_value_matrix[:, :, 0]
getitem_35: "bf16[1, 12, 9, 64, 64][3538944, 294912, 32768, 512, 1]cpu" = attn_weights[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(None, 64, None))]
getitem_36: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), 0)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:759 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += torch.einsum(
einsum_3: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.functional.einsum('bhlqk,bhkd->bhlqd', getitem_35, getitem_36); getitem_35 = getitem_36 = None
context_layer_1 += einsum_3; context_layer_2: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = context_layer_1; context_layer_1 = einsum_3 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:763 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, -to_block_size:], blocked_value_matrix[:, :, -1]
getitem_37: "bf16[1, 12, 9, 64, 64][3538944, 294912, 32768, 512, 1]cpu" = attn_weights[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(-64, None, None))]; attn_weights = None
getitem_38: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), -1)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:762 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += torch.einsum(
einsum_4: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.functional.einsum('bhlqk,bhkd->bhlqd', getitem_37, getitem_38); getitem_37 = getitem_38 = None
context_layer_2 += einsum_4; context_layer_3: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = context_layer_2; context_layer_2 = einsum_4 = None
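# Middle context = softmax-weighted sum over the same four key groups: the band
# and random slices of attn_weights via bmm, the first/last global slices via
# einsum, accumulated in place with +=.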
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:775 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 0],
getitem_39: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), 0)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:776 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -3],
getitem_40: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), -3)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:777 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -2],
getitem_41: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), -2)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:778 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -1],
getitem_42: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_key_matrix[(slice(None, None, None), slice(None, None, None), -1)]; blocked_key_matrix = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:779 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key[:, :, -1],
getitem_43: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = gathered_key[(slice(None, None, None), slice(None, None, None), -1)]; gathered_key = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:773 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_key_mat = torch.cat(
second_last_key_mat: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" = torch.cat([getitem_39, getitem_40, getitem_41, getitem_42, getitem_43], dim = 2); getitem_39 = getitem_40 = getitem_41 = getitem_42 = getitem_43 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:785 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 0],
getitem_44: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), 0)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:786 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -3],
getitem_45: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), -3)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:787 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -2],
getitem_46: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), -2)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:788 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -1],
getitem_47: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_value_matrix[(slice(None, None, None), slice(None, None, None), -1)]; blocked_value_matrix = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:789 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value[:, :, -1],
getitem_48: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = gathered_value[(slice(None, None, None), slice(None, None, None), -1)]; gathered_value = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:783 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_value_mat = torch.cat(
second_last_value_mat: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" = torch.cat([getitem_44, getitem_45, getitem_46, getitem_47, getitem_48], dim = 2); getitem_44 = getitem_45 = getitem_46 = getitem_47 = getitem_48 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:795 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -2], second_last_key_mat, ndim=4)
getitem_49: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_query_matrix[(slice(None, None, None), slice(None, None, None), -2)]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
reshape_20: "bf16[12, 64, 64][64, 768, 1]cpu" = getitem_49.reshape((-1, 64, 64)); getitem_49 = None
reshape_21: "bf16[12, 448, 64][28672, 64, 1]cpu" = second_last_key_mat.reshape((-1, 448, 64)); second_last_key_mat = None
transpose_4: "bf16[12, 64, 448][28672, 1, 64]cpu" = reshape_21.transpose(1, 2); reshape_21 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
bmm_8: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.bmm(reshape_20, transpose_4); reshape_20 = transpose_4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
second_last_product: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = bmm_8.view((1, 12, 64, 448)); bmm_8 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:798 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, :to_block_size],
getitem_50: "f32[1, 1, 1, 64][832, 832, 832, 1]cpu" = l_to_mask_[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(None, 64, None))]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:799 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, -3 * to_block_size :],
getitem_51: "f32[1, 1, 1, 192][832, 832, 832, 1]cpu" = l_to_mask_[(slice(None, None, None), slice(None, None, None), slice(None, None, None), slice(-192, None, None))]
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:800 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask.new_ones([bsz, 1, 1, n_rand_blocks * to_block_size]),
new_ones_2: "f32[1, 1, 1, 192][192, 192, 192, 1]cpu" = l_to_mask_.new_ones([1, 1, 1, 192])
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:796 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_seq_pad = torch.cat(
second_last_seq_pad: "f32[1, 1, 1, 448][448, 448, 448, 1]cpu" = torch.cat([getitem_50, getitem_51, new_ones_2], dim = 3); getitem_50 = getitem_51 = new_ones_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:806 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask.new_ones([bsz, n_heads, from_block_size, 4 * to_block_size]),
new_ones_3: "f32[1, 12, 64, 256][196608, 16384, 256, 1]cpu" = rand_mask_2.new_ones([1, 12, 64, 256])
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:807 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask[:, :, -1],
getitem_52: "f32[1, 12, 64, 192][1622016, 135168, 192, 1]cpu" = rand_mask_2[(slice(None, None, None), slice(None, None, None), -1)]; rand_mask_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:804 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_rand_pad = torch.cat(
second_last_rand_pad: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.cat([new_ones_3, getitem_52], dim = 3); new_ones_3 = getitem_52 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:811 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product = second_last_product * rsqrt_d
second_last_product_1: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = second_last_product * 0.125; second_last_product = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:812 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product += (1.0 - torch.minimum(second_last_seq_pad, second_last_rand_pad)) * attn_mask_penalty
minimum_1: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.minimum(second_last_seq_pad, second_last_rand_pad); second_last_seq_pad = second_last_rand_pad = None
sub_6: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = 1.0 - minimum_1; minimum_1 = None
mul_15: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = sub_6 * -10000.0; sub_6 = None
second_last_product_1 += mul_15; second_last_product_2: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = second_last_product_1; second_last_product_1 = mul_15 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:813 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_attn_weights = nn.functional.softmax(
second_last_attn_weights: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.nn.functional.softmax(second_last_product_2, dim = -1); second_last_product_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
reshape_22: "bf16[12, 64, 448][28672, 448, 1]cpu" = second_last_attn_weights.reshape((-1, 64, 448)); second_last_attn_weights = None
reshape_23: "bf16[12, 448, 64][28672, 64, 1]cpu" = second_last_value_mat.reshape((-1, 448, 64)); second_last_value_mat = None
bmm_9: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.bmm(reshape_22, reshape_23); reshape_22 = reshape_23 = None
second_last_context_layer: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = bmm_9.view((1, 12, 64, 64)); bmm_9 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:819 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_context_layer.unsqueeze_(2)
unsqueeze__3: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = second_last_context_layer.unsqueeze_(2)
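# Second-to-last query block mirrors the second one: key blocks 0, -3, -2, -1
# plus the last row of gathered random blocks (again 448 = 7 * 64 keys).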
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:826 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -1], key_layer, ndim=4)
getitem_53: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = blocked_query_matrix[(slice(None, None, None), slice(None, None, None), -1)]; blocked_query_matrix = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
reshape_24: "bf16[12, 64, 64][64, 768, 1]cpu" = getitem_53.reshape((-1, 64, 64)); getitem_53 = None
reshape_25: "bf16[12, 832, 64][64, 768, 1]cpu" = l_key_layer_.reshape((-1, 832, 64)); l_key_layer_ = None
transpose_5: "bf16[12, 64, 832][64, 1, 768]cpu" = reshape_25.transpose(1, 2); reshape_25 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
bmm_10: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.bmm(reshape_24, transpose_5); reshape_24 = transpose_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
last_product: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = bmm_10.view((1, 12, 64, 832)); bmm_10 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:827 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product = last_product * rsqrt_d
last_product_1: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = last_product * 0.125; last_product = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:828 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product += (1.0 - to_mask) * attn_mask_penalty
sub_7: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = 1.0 - l_to_mask_; l_to_mask_ = None
mul_17: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = sub_7 * -10000.0; sub_7 = None
last_product_1 += mul_17; last_product_2: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = last_product_1; last_product_1 = mul_17 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:829 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_attn_weights = nn.functional.softmax(last_product, dim=-1) # [bsz, n_heads, from_block_size, n]
last_attn_weights: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.nn.functional.softmax(last_product_2, dim = -1); last_product_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
reshape_26: "bf16[12, 64, 832][53248, 832, 1]cpu" = last_attn_weights.reshape((-1, 64, 832)); last_attn_weights = None
reshape_27: "bf16[12, 832, 64][64, 768, 1]cpu" = l_value_layer_.reshape((-1, 832, 64)); l_value_layer_ = None
bmm_11: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.bmm(reshape_26, reshape_27); reshape_26 = reshape_27 = None
last_context_layer: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = bmm_11.view((1, 12, 64, 64)); bmm_11 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:833 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_context_layer.unsqueeze_(2)
unsqueeze__4: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = last_context_layer.unsqueeze_(2)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:836 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer = torch.cat(
context_layer_4: "bf16[1, 12, 13, 64, 64][638976, 53248, 4096, 64, 1]cpu" = torch.cat([first_context_layer, second_context_layer, context_layer_3, second_last_context_layer, last_context_layer], dim = 2); first_context_layer = second_context_layer = context_layer_3 = second_last_context_layer = last_context_layer = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:840 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer = context_layer.view((bsz, n_heads, from_seq_len, -1)) * from_mask
view_20: "bf16[1, 12, 832, 64][638976, 53248, 64, 1]cpu" = context_layer_4.view((1, 12, 832, -1)); context_layer_4 = None
context_layer_5: "f32[1, 12, 832, 64][638976, 53248, 64, 1]cpu" = view_20 * l_from_mask_; view_20 = l_from_mask_ = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:841 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer = torch.transpose(context_layer, 1, 2)
context_layer_6: "f32[1, 832, 12, 64][638976, 64, 53248, 1]cpu" = torch.transpose(context_layer_5, 1, 2); context_layer_5 = None
return (context_layer_6, rand_attn_2)
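The graph above repeatedly inlines torch_bmm_nd / torch_bmm_nd_transpose from modeling_big_bird.py; as a reference for reading those reshape/bmm/view triples, here is a minimal standalone sketch of the same flattening pattern (shapes picked to match this trace; the helper name and example tensors are illustrative, not part of the log):

import torch

def bmm_nd(inp_1, inp_2, ndim):
    # Collapse every leading dim into the single bmm batch dim,
    # multiply the trailing 2-D matrices, then restore the leading dims.
    out = torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]),
                    inp_2.reshape((-1,) + inp_2.shape[-2:]))
    return out.view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 1]))

attn = torch.randn(1, 12, 64, 448, dtype=torch.bfloat16)   # e.g. second_last_attn_weights
value = torch.randn(1, 12, 448, 64, dtype=torch.bfloat16)  # e.g. second_last_value_mat
assert bmm_nd(attn, value, ndim=4).shape == (1, 12, 64, 64)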
V0627 17:31:05.175000 139845268738432 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:199] {"aot_forward_graph": {}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0, "has_payload": "9d92a7e58f208e3c617d3e5fb4f3ee25"}
class <lambda>(torch.nn.Module):
def forward(self, arg0_1: "i32[11, 3][3, 1]cpu", arg1_1: "i32[11, 3][3, 1]cpu", arg2_1: "i32[11, 3][3, 1]cpu", arg3_1: "i32[11, 3][3, 1]cpu", arg4_1: "i32[11, 3][3, 1]cpu", arg5_1: "i32[11, 3][3, 1]cpu", arg6_1: "i32[11, 3][3, 1]cpu", arg7_1: "i32[11, 3][3, 1]cpu", arg8_1: "i32[11, 3][3, 1]cpu", arg9_1: "i32[11, 3][3, 1]cpu", arg10_1: "i32[11, 3][3, 1]cpu", arg11_1: "i32[11, 3][3, 1]cpu", arg12_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", arg13_1: "f32[1, 13, 64][832, 64, 1]cpu", arg14_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", arg15_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", arg16_1: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu", arg17_1: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu", arg18_1: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu"):
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:593 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = np.stack(rand_attn, axis=0)
cat: "i32[132, 3][3, 1]cpu" = torch.ops.aten.cat.default([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1]); arg0_1 = arg1_1 = arg2_1 = arg3_1 = arg4_1 = arg5_1 = arg6_1 = arg7_1 = arg8_1 = arg9_1 = arg10_1 = arg11_1 = None
view: "i32[12, 11, 3][33, 3, 1]cpu" = torch.ops.aten.view.default(cat, [12, 11, 3]); cat = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:594 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = torch.tensor(rand_attn, device=query_layer.device, dtype=torch.long)
alias: "i32[12, 11, 3][33, 3, 1]cpu" = torch.ops.aten.alias.default(view); view = None
alias_1: "i32[12, 11, 3][33, 3, 1]cpu" = torch.ops.aten.alias.default(alias); alias = None
alias_2: "i32[12, 11, 3][33, 3, 1]cpu" = torch.ops.aten.alias.default(alias_1); alias_1 = None
convert_element_type: "i64[12, 11, 3][33, 3, 1]cpu" = torch.ops.prims.convert_element_type.default(alias_2, torch.int64); alias_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:595 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn.unsqueeze_(0)
unsqueeze: "i64[1, 12, 11, 3][396, 33, 3, 1]cpu" = torch.ops.aten.unsqueeze.default(convert_element_type, 0); convert_element_type = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:596 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = torch.cat([rand_attn for _ in range(batch_size)], dim=0)
clone: "i64[1, 12, 11, 3][396, 33, 3, 1]cpu" = torch.ops.aten.clone.default(unsqueeze); unsqueeze = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in _create_rand_mask_from_inputs, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
select: "f32[13, 64][64, 1]cpu" = torch.ops.aten.select.int(arg13_1, 0, 0)
select_1: "i64[12, 11, 3][33, 3, 1]cpu" = torch.ops.aten.select.int(clone, 0, 0)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in <listcomp>, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
view_1: "i64[396][1]cpu" = torch.ops.aten.view.default(select_1, [396]); select_1 = None
index: "f32[396, 64][64, 1]cpu" = torch.ops.aten.index.Tensor(select, [view_1]); select = view_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in _create_rand_mask_from_inputs, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
clone_1: "f32[396, 64][64, 1]cpu" = torch.ops.aten.clone.default(index); index = None
view_2: "f32[1, 396, 64][25344, 64, 1]cpu" = torch.ops.aten.view.default(clone_1, [1, 396, 64]); clone_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1016 in _create_rand_mask_from_inputs, code: rand_mask = rand_mask.view(batch_size, num_attention_heads, num_windows, num_rand_blocks * from_block_size)
view_3: "f32[1, 12, 11, 192][25344, 2112, 192, 1]cpu" = torch.ops.aten.view.default(view_2, [1, 12, 11, 192]); view_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1017 in _create_rand_mask_from_inputs, code: rand_mask = torch.einsum("blq,bhlk->bhlqk", from_blocked_mask[:, 1:-1], rand_mask)
slice_1: "f32[1, 13, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(arg13_1, 0, 0, 9223372036854775807); arg13_1 = None
slice_2: "f32[1, 11, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_1, 1, 1, -1); slice_1 = None
unsqueeze_1: "f32[1, 11, 64, 1][832, 64, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_2, 3); slice_2 = None
unsqueeze_2: "f32[1, 11, 64, 1, 1][832, 64, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_1, 4); unsqueeze_1 = None
permute: "f32[1, 1, 11, 64, 1][832, 1, 64, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_2, [0, 3, 1, 2, 4]); unsqueeze_2 = None
unsqueeze_3: "f32[1, 12, 11, 192, 1][25344, 2112, 192, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(view_3, 4); view_3 = None
permute_1: "f32[1, 12, 11, 1, 192][25344, 2112, 192, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_3, [0, 1, 2, 4, 3]); unsqueeze_3 = None
mul: "f32[1, 12, 11, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(permute, permute_1); permute = permute_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:602 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_query_matrix = query_layer.view(bsz, n_heads, from_seq_len // from_block_size, from_block_size, -1)
view_4: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.view.default(arg12_1, [1, 12, 13, 64, -1]); arg12_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:603 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix = key_layer.view(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1)
view_5: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.view.default(arg14_1, [1, 12, 13, 64, -1])
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:604 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix = value_layer.view(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1)
view_6: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.view.default(arg15_1, [1, 12, 13, 64, -1])
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:972 in torch_gather_b2, code: shift = torch.arange(indices.shape[0] * indices.shape[1] * num_indices_to_gather, device=indices.device)
iota: "i64[396][1]cpu" = torch.ops.prims.iota.default(396, start = 0, step = 1, dtype = torch.int64, device = device(type='cpu'), requires_grad = False)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:973 in torch_gather_b2, code: indices_shift = torch.div(shift, num_indices_to_gather, rounding_mode="floor") * num_indices_to_pick_from
div: "i64[396][1]cpu" = torch.ops.aten.div.Tensor_mode(iota, 33, rounding_mode = 'floor'); iota = None
mul_1: "i64[396][1]cpu" = torch.ops.aten.mul.Tensor(div, 13); div = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:975 in torch_gather_b2, code: flattened_indices = indices.view(-1) + indices_shift
view_7: "i64[396][1]cpu" = torch.ops.aten.view.default(clone, [-1])
add: "i64[396][1]cpu" = torch.ops.aten.add.Tensor(view_7, mul_1); view_7 = mul_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:976 in torch_gather_b2, code: flattened_params = params.reshape(-1, params.shape[-2], params.shape[-1])
clone_2: "bf16[1, 12, 13, 64, 64][638976, 53248, 4096, 64, 1]cpu" = torch.ops.aten.clone.default(view_5, memory_format = torch.contiguous_format)
view_8: "bf16[156, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.view.default(clone_2, [156, 64, 64]); clone_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:978 in torch_gather_b2, code: out_flattened = flattened_params.index_select(0, flattened_indices)
index_1: "bf16[396, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.index.Tensor(view_8, [add]); view_8 = add = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:980 in torch_gather_b2, code: out = out_flattened.reshape(params.shape[:2] + (num_indices_to_gather,) + params.shape[3:])
view_9: "bf16[1, 12, 33, 64, 64][1622016, 135168, 4096, 64, 1]cpu" = torch.ops.aten.view.default(index_1, [1, 12, 33, 64, 64]); index_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:608 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key = gathered_key.view(
view_10: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.view.default(view_9, [1, 12, 11, 192, -1]); view_9 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:972 in torch_gather_b2, code: shift = torch.arange(indices.shape[0] * indices.shape[1] * num_indices_to_gather, device=indices.device)
iota_1: "i64[396][1]cpu" = torch.ops.prims.iota.default(396, start = 0, step = 1, dtype = torch.int64, device = device(type='cpu'), requires_grad = False)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:973 in torch_gather_b2, code: indices_shift = torch.div(shift, num_indices_to_gather, rounding_mode="floor") * num_indices_to_pick_from
div_1: "i64[396][1]cpu" = torch.ops.aten.div.Tensor_mode(iota_1, 33, rounding_mode = 'floor'); iota_1 = None
mul_2: "i64[396][1]cpu" = torch.ops.aten.mul.Tensor(div_1, 13); div_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:975 in torch_gather_b2, code: flattened_indices = indices.view(-1) + indices_shift
view_11: "i64[396][1]cpu" = torch.ops.aten.view.default(clone, [-1])
add_1: "i64[396][1]cpu" = torch.ops.aten.add.Tensor(view_11, mul_2); view_11 = mul_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:976 in torch_gather_b2, code: flattened_params = params.reshape(-1, params.shape[-2], params.shape[-1])
clone_3: "bf16[1, 12, 13, 64, 64][638976, 53248, 4096, 64, 1]cpu" = torch.ops.aten.clone.default(view_6, memory_format = torch.contiguous_format)
view_12: "bf16[156, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.view.default(clone_3, [156, 64, 64]); clone_3 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:978 in torch_gather_b2, code: out_flattened = flattened_params.index_select(0, flattened_indices)
index_2: "bf16[396, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.index.Tensor(view_12, [add_1]); view_12 = add_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:980 in torch_gather_b2, code: out = out_flattened.reshape(params.shape[:2] + (num_indices_to_gather,) + params.shape[3:])
view_13: "bf16[1, 12, 33, 64, 64][1622016, 135168, 4096, 64, 1]cpu" = torch.ops.aten.view.default(index_2, [1, 12, 33, 64, 64]); index_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:612 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value = gathered_value.view(
view_14: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.view.default(view_13, [1, 12, 11, 192, -1]); view_13 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:621 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, 0], key_layer, ndim=4)
slice_3: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_4, 0, 0, 9223372036854775807)
slice_4: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_3, 1, 0, 9223372036854775807); slice_3 = None
select_2: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_4, 2, 0); slice_4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
view_15: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(select_2, [12, 64, 64]); select_2 = None
view_16: "bf16[12, 832, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(arg14_1, [12, 832, 64])
permute_2: "bf16[12, 64, 832][64, 1, 768]cpu" = torch.ops.aten.permute.default(view_16, [0, 2, 1]); view_16 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
bmm: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.ops.aten.bmm.default(view_15, permute_2); view_15 = permute_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
view_17: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.view.default(bmm, [1, 12, 64, 832]); bmm = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:623 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product = first_product * rsqrt_d
mul_3: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.mul.Tensor(view_17, 0.125); view_17 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:624 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product += (1.0 - to_mask) * attn_mask_penalty
sub: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, arg16_1)
mul_4: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.mul.Tensor(sub, -10000.0); sub = None
add_2: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.add.Tensor(mul_3, mul_4); mul_3 = mul_4 = None
convert_element_type_3: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.prims.convert_element_type.default(add_2, torch.bfloat16); add_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:625 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_attn_weights = nn.functional.softmax(
convert_element_type_4: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.prims.convert_element_type.default(convert_element_type_3, torch.float32); convert_element_type_3 = None
amax: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.amax.default(convert_element_type_4, [-1], True)
sub_1: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.sub.Tensor(convert_element_type_4, amax); convert_element_type_4 = amax = None
exp: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.exp.default(sub_1); sub_1 = None
sum_1: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp, [-1], True)
div_2: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.div.Tensor(exp, sum_1); exp = sum_1 = None
convert_element_type_5: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.prims.convert_element_type.default(div_2, torch.bfloat16); div_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
view_18: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.ops.aten.view.default(convert_element_type_5, [-1, 64, 832]); convert_element_type_5 = None
view_19: "bf16[12, 832, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(arg15_1, [12, 832, 64])
bmm_1: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_18, view_19); view_18 = view_19 = None
view_20: "bf16[1, 12, 64, 64][49152, 4096, 64, 1]cpu" = torch.ops.aten.view.default(bmm_1, [1, 12, 64, 64]); bmm_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:631 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_context_layer.unsqueeze_(2)
unsqueeze_4: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(view_20, 2); view_20 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:641 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 0],
slice_5: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807)
slice_6: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_5, 1, 0, 9223372036854775807); slice_5 = None
select_3: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_6, 2, 0); slice_6 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:642 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 1],
slice_7: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807)
slice_8: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_7, 1, 0, 9223372036854775807); slice_7 = None
select_4: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_8, 2, 1); slice_8 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:643 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 2],
slice_9: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807)
slice_10: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_9, 1, 0, 9223372036854775807); slice_9 = None
select_5: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_10, 2, 2); slice_10 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:644 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -1],
slice_11: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807)
slice_12: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_11, 1, 0, 9223372036854775807); slice_11 = None
select_6: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_12, 2, -1); slice_12 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:645 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key[:, :, 0],
slice_13: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view_10, 0, 0, 9223372036854775807)
slice_14: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_13, 1, 0, 9223372036854775807); slice_13 = None
select_7: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = torch.ops.aten.select.int(slice_14, 2, 0); slice_14 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:639 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_key_mat = torch.cat(
cat_1: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" = torch.ops.aten.cat.default([select_3, select_4, select_5, select_6, select_7], 2); select_3 = select_4 = select_5 = select_6 = select_7 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:651 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 0],
slice_15: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807)
slice_16: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_15, 1, 0, 9223372036854775807); slice_15 = None
select_8: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_16, 2, 0); slice_16 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:652 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 1],
slice_17: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807)
slice_18: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_17, 1, 0, 9223372036854775807); slice_17 = None
select_9: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_18, 2, 1); slice_18 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:653 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 2],
slice_19: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807)
slice_20: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_19, 1, 0, 9223372036854775807); slice_19 = None
select_10: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_20, 2, 2); slice_20 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:654 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -1],
slice_21: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807)
slice_22: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_21, 1, 0, 9223372036854775807); slice_21 = None
select_11: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_22, 2, -1); slice_22 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:655 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value[:, :, 0],
slice_23: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view_14, 0, 0, 9223372036854775807)
slice_24: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_23, 1, 0, 9223372036854775807); slice_23 = None
select_12: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = torch.ops.aten.select.int(slice_24, 2, 0); slice_24 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:649 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_value_mat = torch.cat(
cat_2: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" = torch.ops.aten.cat.default([select_8, select_9, select_10, select_11, select_12], 2); select_8 = select_9 = select_10 = select_11 = select_12 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:661 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, 1], second_key_mat, ndim=4)
slice_25: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_4, 0, 0, 9223372036854775807)
slice_26: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_25, 1, 0, 9223372036854775807); slice_25 = None
select_13: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_26, 2, 1); slice_26 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
view_21: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(select_13, [12, 64, 64]); select_13 = None
view_22: "bf16[12, 448, 64][28672, 64, 1]cpu" = torch.ops.aten.view.default(cat_1, [-1, 448, 64]); cat_1 = None
permute_3: "bf16[12, 64, 448][28672, 1, 64]cpu" = torch.ops.aten.permute.default(view_22, [0, 2, 1]); view_22 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
bmm_2: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.ops.aten.bmm.default(view_21, permute_3); view_21 = permute_3 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
view_23: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.view.default(bmm_2, [1, 12, 64, 448]); bmm_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:664 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, : 3 * to_block_size],
slice_27: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 0, 0, 9223372036854775807)
slice_28: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_27, 1, 0, 9223372036854775807); slice_27 = None
slice_29: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_28, 2, 0, 9223372036854775807); slice_28 = None
slice_30: "f32[1, 1, 1, 192][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_29, 3, 0, 192); slice_29 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:665 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, -to_block_size:],
slice_31: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 0, 0, 9223372036854775807)
slice_32: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_31, 1, 0, 9223372036854775807); slice_31 = None
slice_33: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_32, 2, 0, 9223372036854775807); slice_32 = None
slice_34: "f32[1, 1, 1, 64][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_33, 3, -64, 9223372036854775807); slice_33 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:666 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask.new_ones([bsz, 1, 1, n_rand_blocks * to_block_size]),
full: "f32[1, 1, 1, 192][192, 192, 192, 1]cpu" = torch.ops.aten.full.default([1, 1, 1, 192], 1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:662 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_seq_pad = torch.cat(
cat_3: "f32[1, 1, 1, 448][448, 448, 448, 1]cpu" = torch.ops.aten.cat.default([slice_30, slice_34, full], 3); slice_30 = slice_34 = full = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:672 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask.new_ones([bsz, n_heads, from_block_size, 4 * to_block_size]),
full_1: "f32[1, 12, 64, 256][196608, 16384, 256, 1]cpu" = torch.ops.aten.full.default([1, 12, 64, 256], 1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:673 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask[:, :, 0],
slice_35: "f32[1, 12, 11, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.ops.aten.slice.Tensor(mul, 0, 0, 9223372036854775807)
slice_36: "f32[1, 12, 11, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.ops.aten.slice.Tensor(slice_35, 1, 0, 9223372036854775807); slice_35 = None
select_14: "f32[1, 12, 64, 192][1622016, 135168, 192, 1]cpu" = torch.ops.aten.select.int(slice_36, 2, 0); slice_36 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:670 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_rand_pad = torch.cat(
cat_4: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.cat.default([full_1, select_14], 3); full_1 = select_14 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:677 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product = second_product * rsqrt_d
mul_5: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.mul.Tensor(view_23, 0.125); view_23 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:678 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product += (1.0 - torch.minimum(second_seq_pad, second_rand_pad)) * attn_mask_penalty
minimum: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.minimum.default(cat_3, cat_4); cat_3 = cat_4 = None
sub_2: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, minimum); minimum = None
mul_6: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.mul.Tensor(sub_2, -10000.0); sub_2 = None
add_3: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.add.Tensor(mul_5, mul_6); mul_5 = mul_6 = None
convert_element_type_10: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.prims.convert_element_type.default(add_3, torch.bfloat16); add_3 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:679 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_attn_weights = nn.functional.softmax(
convert_element_type_11: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.prims.convert_element_type.default(convert_element_type_10, torch.float32); convert_element_type_10 = None
amax_1: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.amax.default(convert_element_type_11, [-1], True)
sub_3: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.sub.Tensor(convert_element_type_11, amax_1); convert_element_type_11 = amax_1 = None
exp_1: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.exp.default(sub_3); sub_3 = None
sum_2: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp_1, [-1], True)
div_3: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.div.Tensor(exp_1, sum_2); exp_1 = sum_2 = None
convert_element_type_12: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.prims.convert_element_type.default(div_3, torch.bfloat16); div_3 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
view_24: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.ops.aten.view.default(convert_element_type_12, [-1, 64, 448]); convert_element_type_12 = None
view_25: "bf16[12, 448, 64][28672, 64, 1]cpu" = torch.ops.aten.view.default(cat_2, [-1, 448, 64]); cat_2 = None
bmm_3: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_24, view_25); view_24 = view_25 = None
view_26: "bf16[1, 12, 64, 64][49152, 4096, 64, 1]cpu" = torch.ops.aten.view.default(bmm_3, [1, 12, 64, 64]); bmm_3 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:686 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_context_layer.unsqueeze_(2)
unsqueeze_5: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(view_26, 2); view_26 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:696 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: [blocked_key_matrix[:, :, 1:-3], blocked_key_matrix[:, :, 2:-2], blocked_key_matrix[:, :, 3:-1]], dim=3
slice_37: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807)
slice_38: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_37, 1, 0, 9223372036854775807); slice_37 = None
slice_39: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_38, 2, 1, -3); slice_38 = None
slice_40: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807)
slice_41: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_40, 1, 0, 9223372036854775807); slice_40 = None
slice_42: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_41, 2, 2, -2); slice_41 = None
slice_43: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807)
slice_44: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_43, 1, 0, 9223372036854775807); slice_43 = None
slice_45: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_44, 2, 3, -1); slice_44 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:695 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: exp_blocked_key_matrix = torch.cat(
cat_5: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" = torch.ops.aten.cat.default([slice_39, slice_42, slice_45], 3); slice_39 = slice_42 = slice_45 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:699 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: [blocked_value_matrix[:, :, 1:-3], blocked_value_matrix[:, :, 2:-2], blocked_value_matrix[:, :, 3:-1]],
slice_46: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807)
slice_47: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_46, 1, 0, 9223372036854775807); slice_46 = None
slice_48: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_47, 2, 1, -3); slice_47 = None
slice_49: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807)
slice_50: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_49, 1, 0, 9223372036854775807); slice_49 = None
slice_51: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_50, 2, 2, -2); slice_50 = None
slice_52: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807)
slice_53: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_52, 1, 0, 9223372036854775807); slice_52 = None
slice_54: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_53, 2, 3, -1); slice_53 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:698 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: exp_blocked_value_matrix = torch.cat(
cat_6: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" = torch.ops.aten.cat.default([slice_48, slice_51, slice_54], 3); slice_48 = slice_51 = slice_54 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:702 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: middle_query_matrix = blocked_query_matrix[:, :, 2:-2]
slice_55: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_4, 0, 0, 9223372036854775807)
slice_56: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_55, 1, 0, 9223372036854775807); slice_55 = None
slice_57: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_56, 2, 2, -2); slice_56 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
clone_4: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.clone.default(slice_57, memory_format = torch.contiguous_format)
view_27: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.view.default(clone_4, [108, 64, 64]); clone_4 = None
view_28: "bf16[108, 192, 64][12288, 64, 1]cpu" = torch.ops.aten.view.default(cat_5, [-1, 192, 64]); cat_5 = None
permute_4: "bf16[108, 64, 192][12288, 1, 64]cpu" = torch.ops.aten.permute.default(view_28, [0, 2, 1]); view_28 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
bmm_4: "bf16[108, 64, 192][12288, 192, 1]cpu" = torch.ops.aten.bmm.default(view_27, permute_4); view_27 = permute_4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
view_29: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.view.default(bmm_4, [1, 12, 9, 64, 192]); bmm_4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:708 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: inner_band_product = inner_band_product * rsqrt_d
mul_7: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(view_29, 0.125); view_29 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:712 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product = self.torch_bmm_nd_transpose(middle_query_matrix, gathered_key[:, :, 1:-1], ndim=5)
slice_58: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view_10, 0, 0, 9223372036854775807)
slice_59: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_58, 1, 0, 9223372036854775807); slice_58 = None
slice_60: "bf16[1, 12, 9, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_59, 2, 1, -1); slice_59 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
clone_5: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.clone.default(slice_57, memory_format = torch.contiguous_format)
view_30: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.view.default(clone_5, [108, 64, 64]); clone_5 = None
clone_6: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" = torch.ops.aten.clone.default(slice_60, memory_format = torch.contiguous_format); slice_60 = None
view_31: "bf16[108, 192, 64][12288, 64, 1]cpu" = torch.ops.aten.view.default(clone_6, [108, 192, 64]); clone_6 = None
permute_5: "bf16[108, 64, 192][12288, 1, 64]cpu" = torch.ops.aten.permute.default(view_31, [0, 2, 1]); view_31 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
bmm_5: "bf16[108, 64, 192][12288, 192, 1]cpu" = torch.ops.aten.bmm.default(view_30, permute_5); view_30 = permute_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
view_32: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.view.default(bmm_5, [1, 12, 9, 64, 192]); bmm_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:714 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product = rand_band_product * rsqrt_d
mul_8: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(view_32, 0.125); view_32 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:718 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqd,bhkd->bhlqk", middle_query_matrix, blocked_key_matrix[:, :, 0]
slice_61: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807)
slice_62: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_61, 1, 0, 9223372036854775807); slice_61 = None
select_15: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_62, 2, 0); slice_62 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:717 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product = torch.einsum(
unsqueeze_6: "bf16[1, 12, 9, 64, 64, 1][638976, 64, 49152, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_57, 5)
permute_6: "bf16[1, 12, 9, 64, 1, 64][638976, 64, 49152, 768, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_6, [0, 1, 2, 3, 5, 4]); unsqueeze_6 = None
unsqueeze_7: "bf16[1, 12, 64, 64, 1][638976, 64, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(select_15, 4); select_15 = None
unsqueeze_8: "bf16[1, 12, 64, 64, 1, 1][638976, 64, 768, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_7, 5); unsqueeze_7 = None
permute_7: "bf16[1, 12, 1, 1, 64, 64][638976, 64, 1, 1, 768, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_8, [0, 1, 4, 5, 2, 3]); unsqueeze_8 = None
permute_8: "bf16[12, 9, 64, 64, 1, 1][64, 49152, 768, 1, 638976, 1]cpu" = torch.ops.aten.permute.default(permute_6, [1, 2, 3, 5, 0, 4]); permute_6 = None
view_33: "bf16[12, 576, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(permute_8, [12, 576, 64]); permute_8 = None
permute_9: "bf16[12, 64, 1, 64, 1, 1][64, 1, 638976, 768, 1, 1]cpu" = torch.ops.aten.permute.default(permute_7, [1, 5, 0, 4, 2, 3]); permute_7 = None
view_34: "bf16[12, 64, 64][64, 1, 768]cpu" = torch.ops.aten.view.default(permute_9, [12, 64, 64]); permute_9 = None
bmm_6: "bf16[12, 576, 64][36864, 64, 1]cpu" = torch.ops.aten.bmm.default(view_33, view_34); view_33 = view_34 = None
view_35: "bf16[12, 9, 64, 1, 1, 64][36864, 4096, 64, 64, 64, 1]cpu" = torch.ops.aten.view.default(bmm_6, [12, 9, 64, 1, 1, 64]); bmm_6 = None
permute_10: "bf16[1, 12, 9, 64, 64, 1][64, 36864, 4096, 64, 1, 64]cpu" = torch.ops.aten.permute.default(view_35, [4, 0, 1, 2, 5, 3]); view_35 = None
view_36: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.ops.aten.view.default(permute_10, [1, 12, 9, 64, 64]); permute_10 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:720 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product = first_band_product * rsqrt_d
mul_9: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.mul.Tensor(view_36, 0.125); view_36 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:724 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqd,bhkd->bhlqk", middle_query_matrix, blocked_key_matrix[:, :, -1]
slice_63: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807)
slice_64: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_63, 1, 0, 9223372036854775807); slice_63 = None
select_16: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_64, 2, -1); slice_64 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:723 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product = torch.einsum(
unsqueeze_9: "bf16[1, 12, 9, 64, 64, 1][638976, 64, 49152, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_57, 5); slice_57 = None
permute_11: "bf16[1, 12, 9, 64, 1, 64][638976, 64, 49152, 768, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_9, [0, 1, 2, 3, 5, 4]); unsqueeze_9 = None
unsqueeze_10: "bf16[1, 12, 64, 64, 1][638976, 64, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(select_16, 4); select_16 = None
unsqueeze_11: "bf16[1, 12, 64, 64, 1, 1][638976, 64, 768, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_10, 5); unsqueeze_10 = None
permute_12: "bf16[1, 12, 1, 1, 64, 64][638976, 64, 1, 1, 768, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_11, [0, 1, 4, 5, 2, 3]); unsqueeze_11 = None
permute_13: "bf16[12, 9, 64, 64, 1, 1][64, 49152, 768, 1, 638976, 1]cpu" = torch.ops.aten.permute.default(permute_11, [1, 2, 3, 5, 0, 4]); permute_11 = None
view_37: "bf16[12, 576, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(permute_13, [12, 576, 64]); permute_13 = None
permute_14: "bf16[12, 64, 1, 64, 1, 1][64, 1, 638976, 768, 1, 1]cpu" = torch.ops.aten.permute.default(permute_12, [1, 5, 0, 4, 2, 3]); permute_12 = None
view_38: "bf16[12, 64, 64][64, 1, 768]cpu" = torch.ops.aten.view.default(permute_14, [12, 64, 64]); permute_14 = None
bmm_7: "bf16[12, 576, 64][36864, 64, 1]cpu" = torch.ops.aten.bmm.default(view_37, view_38); view_37 = view_38 = None
view_39: "bf16[12, 9, 64, 1, 1, 64][36864, 4096, 64, 64, 64, 1]cpu" = torch.ops.aten.view.default(bmm_7, [12, 9, 64, 1, 1, 64]); bmm_7 = None
permute_15: "bf16[1, 12, 9, 64, 64, 1][64, 36864, 4096, 64, 1, 64]cpu" = torch.ops.aten.permute.default(view_39, [4, 0, 1, 2, 5, 3]); view_39 = None
view_40: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.ops.aten.view.default(permute_15, [1, 12, 9, 64, 64]); permute_15 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:726 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product = last_band_product * rsqrt_d
mul_10: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.mul.Tensor(view_40, 0.125); view_40 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:729 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: inner_band_product += (1.0 - band_mask) * attn_mask_penalty
sub_4: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, arg17_1); arg17_1 = None
mul_11: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(sub_4, -10000.0); sub_4 = None
add_4: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.add.Tensor(mul_7, mul_11); mul_7 = mul_11 = None
convert_element_type_23: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.prims.convert_element_type.default(add_4, torch.bfloat16); add_4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:730 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product += (1.0 - to_mask[:, :, :, :to_block_size].unsqueeze(3)) * attn_mask_penalty
slice_65: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 0, 0, 9223372036854775807)
slice_66: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_65, 1, 0, 9223372036854775807); slice_65 = None
slice_67: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_66, 2, 0, 9223372036854775807); slice_66 = None
slice_68: "f32[1, 1, 1, 64][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_67, 3, 0, 64); slice_67 = None
unsqueeze_12: "f32[1, 1, 1, 1, 64][832, 832, 832, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_68, 3); slice_68 = None
sub_5: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, unsqueeze_12); unsqueeze_12 = None
mul_12: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = torch.ops.aten.mul.Tensor(sub_5, -10000.0); sub_5 = None
add_5: "f32[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(mul_9, mul_12); mul_9 = mul_12 = None
convert_element_type_24: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.prims.convert_element_type.default(add_5, torch.bfloat16); add_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:731 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product += (1.0 - to_mask[:, :, :, -to_block_size:].unsqueeze(3)) * attn_mask_penalty
slice_69: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 0, 0, 9223372036854775807)
slice_70: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_69, 1, 0, 9223372036854775807); slice_69 = None
slice_71: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_70, 2, 0, 9223372036854775807); slice_70 = None
slice_72: "f32[1, 1, 1, 64][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_71, 3, -64, 9223372036854775807); slice_71 = None
unsqueeze_13: "f32[1, 1, 1, 1, 64][832, 832, 832, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_72, 3); slice_72 = None
sub_6: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, unsqueeze_13); unsqueeze_13 = None
mul_13: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = torch.ops.aten.mul.Tensor(sub_6, -10000.0); sub_6 = None
add_6: "f32[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(mul_10, mul_13); mul_10 = mul_13 = None
convert_element_type_25: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.prims.convert_element_type.default(add_6, torch.bfloat16); add_6 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:732 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product += (1.0 - rand_mask[:, :, 1:-1]) * attn_mask_penalty
slice_73: "f32[1, 12, 11, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.ops.aten.slice.Tensor(mul, 0, 0, 9223372036854775807)
slice_74: "f32[1, 12, 11, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.ops.aten.slice.Tensor(slice_73, 1, 0, 9223372036854775807); slice_73 = None
slice_75: "f32[1, 12, 9, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.ops.aten.slice.Tensor(slice_74, 2, 1, -1); slice_74 = None
sub_7: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, slice_75); slice_75 = None
mul_14: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(sub_7, -10000.0); sub_7 = None
add_7: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.add.Tensor(mul_8, mul_14); mul_8 = mul_14 = None
convert_element_type_26: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.prims.convert_element_type.default(add_7, torch.bfloat16); add_7 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:735 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: band_product = torch.cat(
cat_7: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.cat.default([convert_element_type_24, convert_element_type_23, convert_element_type_26, convert_element_type_25], -1); convert_element_type_24 = convert_element_type_23 = convert_element_type_26 = convert_element_type_25 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:740 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights = nn.functional.softmax(
convert_element_type_27: "f32[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.prims.convert_element_type.default(cat_7, torch.float32); cat_7 = None
amax_2: "f32[1, 12, 9, 64, 1][6912, 576, 64, 1, 1]cpu" = torch.ops.aten.amax.default(convert_element_type_27, [-1], True)
sub_8: "f32[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.sub.Tensor(convert_element_type_27, amax_2); convert_element_type_27 = amax_2 = None
exp_2: "f32[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.exp.default(sub_8); sub_8 = None
sum_3: "f32[1, 12, 9, 64, 1][6912, 576, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp_2, [-1], True)
div_4: "f32[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.div.Tensor(exp_2, sum_3); exp_2 = sum_3 = None
convert_element_type_28: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.prims.convert_element_type.default(div_4, torch.bfloat16); div_4 = None
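        # Sketch: amax_2/sub_8/exp_2/sum_3/div_4 are the numerically stable softmax
        # decomposition (subtract the row max, exponentiate, normalize), run in fp32
        # and cast back to bf16. Dense equivalent, with the shape assumed from this trace:
        import torch
        x = torch.randn(1, 12, 9, 64, 512)
        m = x.amax(dim=-1, keepdim=True)        # row max keeps exp() from overflowing
        p = (x - m).exp()
        soft = p / p.sum(dim=-1, keepdim=True)
        assert torch.allclose(soft, torch.softmax(x, dim=-1), atol=1e-6)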
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:747 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights[:, :, :, :, to_block_size : 4 * to_block_size], exp_blocked_value_matrix, ndim=5
slice_76: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(convert_element_type_28, 0, 0, 9223372036854775807)
slice_77: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_76, 1, 0, 9223372036854775807); slice_76 = None
slice_78: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_77, 2, 0, 9223372036854775807); slice_77 = None
slice_79: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_78, 3, 0, 9223372036854775807); slice_78 = None
slice_80: "bf16[1, 12, 9, 64, 192][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_79, 4, 64, 256); slice_79 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
view_41: "bf16[108, 64, 192][32768, 512, 1]cpu" = torch.ops.aten.view.default(slice_80, [108, 64, 192]); slice_80 = None
view_42: "bf16[108, 192, 64][12288, 64, 1]cpu" = torch.ops.aten.view.default(cat_6, [-1, 192, 64]); cat_6 = None
bmm_8: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_41, view_42); view_41 = view_42 = None
view_43: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.view.default(bmm_8, [1, 12, 9, 64, 64]); bmm_8 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:754 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights[:, :, :, :, 4 * to_block_size : -to_block_size], gathered_value[:, :, 1:-1], ndim=5
slice_81: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(convert_element_type_28, 0, 0, 9223372036854775807)
slice_82: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_81, 1, 0, 9223372036854775807); slice_81 = None
slice_83: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_82, 2, 0, 9223372036854775807); slice_82 = None
slice_84: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_83, 3, 0, 9223372036854775807); slice_83 = None
slice_85: "bf16[1, 12, 9, 64, 192][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_84, 4, 256, -64); slice_84 = None
slice_86: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view_14, 0, 0, 9223372036854775807)
slice_87: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_86, 1, 0, 9223372036854775807); slice_86 = None
slice_88: "bf16[1, 12, 9, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_87, 2, 1, -1); slice_87 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
view_44: "bf16[108, 64, 192][32768, 512, 1]cpu" = torch.ops.aten.view.default(slice_85, [108, 64, 192]); slice_85 = None
clone_7: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" = torch.ops.aten.clone.default(slice_88, memory_format = torch.contiguous_format); slice_88 = None
view_45: "bf16[108, 192, 64][12288, 64, 1]cpu" = torch.ops.aten.view.default(clone_7, [108, 192, 64]); clone_7 = None
bmm_9: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_44, view_45); view_44 = view_45 = None
view_46: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.view.default(bmm_9, [1, 12, 9, 64, 64]); bmm_9 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:753 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += self.torch_bmm_nd(
add_8: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(view_43, view_46); view_43 = view_46 = None
view_47: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.view.default(add_8, [108, 64, 64]); add_8 = None
view_48: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.view.default(view_47, [1, 12, 9, 64, 64]); view_47 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:760 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, :to_block_size], blocked_value_matrix[:, :, 0]
slice_89: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(convert_element_type_28, 0, 0, 9223372036854775807)
slice_90: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_89, 1, 0, 9223372036854775807); slice_89 = None
slice_91: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_90, 2, 0, 9223372036854775807); slice_90 = None
slice_92: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_91, 3, 0, 9223372036854775807); slice_91 = None
slice_93: "bf16[1, 12, 9, 64, 64][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_92, 4, 0, 64); slice_92 = None
slice_94: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807)
slice_95: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_94, 1, 0, 9223372036854775807); slice_94 = None
select_17: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_95, 2, 0); slice_95 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:759 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += torch.einsum(
unsqueeze_14: "bf16[1, 12, 9, 64, 64, 1][3538944, 294912, 32768, 512, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_93, 5); slice_93 = None
permute_16: "bf16[1, 12, 9, 64, 1, 64][3538944, 294912, 32768, 512, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_14, [0, 1, 2, 3, 5, 4]); unsqueeze_14 = None
unsqueeze_15: "bf16[1, 12, 64, 64, 1][638976, 64, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(select_17, 4); select_17 = None
unsqueeze_16: "bf16[1, 12, 64, 64, 1, 1][638976, 64, 768, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_15, 5); unsqueeze_15 = None
permute_17: "bf16[1, 12, 1, 1, 64, 64][638976, 64, 1, 1, 1, 768]cpu" = torch.ops.aten.permute.default(unsqueeze_16, [0, 1, 4, 5, 3, 2]); unsqueeze_16 = None
permute_18: "bf16[12, 9, 64, 64, 1, 1][294912, 32768, 512, 1, 3538944, 1]cpu" = torch.ops.aten.permute.default(permute_16, [1, 2, 3, 5, 0, 4]); permute_16 = None
view_49: "bf16[12, 576, 64][294912, 512, 1]cpu" = torch.ops.aten.view.default(permute_18, [12, 576, 64]); permute_18 = None
permute_19: "bf16[12, 64, 1, 64, 1, 1][64, 768, 638976, 1, 1, 1]cpu" = torch.ops.aten.permute.default(permute_17, [1, 5, 0, 4, 2, 3]); permute_17 = None
view_50: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(permute_19, [12, 64, 64]); permute_19 = None
bmm_10: "bf16[12, 576, 64][36864, 64, 1]cpu" = torch.ops.aten.bmm.default(view_49, view_50); view_49 = view_50 = None
view_51: "bf16[12, 9, 64, 1, 1, 64][36864, 4096, 64, 64, 64, 1]cpu" = torch.ops.aten.view.default(bmm_10, [12, 9, 64, 1, 1, 64]); bmm_10 = None
permute_20: "bf16[1, 12, 9, 64, 64, 1][64, 36864, 4096, 64, 1, 64]cpu" = torch.ops.aten.permute.default(view_51, [4, 0, 1, 2, 5, 3]); view_51 = None
view_52: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.ops.aten.view.default(permute_20, [1, 12, 9, 64, 64]); permute_20 = None
add_9: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(view_48, view_52); view_48 = view_52 = None
view_53: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.view.default(add_9, [108, 64, 64]); add_9 = None
view_54: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.view.default(view_53, [1, 12, 9, 64, 64]); view_53 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:763 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, -to_block_size:], blocked_value_matrix[:, :, -1]
slice_96: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(convert_element_type_28, 0, 0, 9223372036854775807); convert_element_type_28 = None
slice_97: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_96, 1, 0, 9223372036854775807); slice_96 = None
slice_98: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_97, 2, 0, 9223372036854775807); slice_97 = None
slice_99: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_98, 3, 0, 9223372036854775807); slice_98 = None
slice_100: "bf16[1, 12, 9, 64, 64][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(slice_99, 4, -64, 9223372036854775807); slice_99 = None
slice_101: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807)
slice_102: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_101, 1, 0, 9223372036854775807); slice_101 = None
select_18: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_102, 2, -1); slice_102 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:762 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += torch.einsum(
unsqueeze_17: "bf16[1, 12, 9, 64, 64, 1][3538944, 294912, 32768, 512, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_100, 5); slice_100 = None
permute_21: "bf16[1, 12, 9, 64, 1, 64][3538944, 294912, 32768, 512, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_17, [0, 1, 2, 3, 5, 4]); unsqueeze_17 = None
unsqueeze_18: "bf16[1, 12, 64, 64, 1][638976, 64, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(select_18, 4); select_18 = None
unsqueeze_19: "bf16[1, 12, 64, 64, 1, 1][638976, 64, 768, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_18, 5); unsqueeze_18 = None
permute_22: "bf16[1, 12, 1, 1, 64, 64][638976, 64, 1, 1, 1, 768]cpu" = torch.ops.aten.permute.default(unsqueeze_19, [0, 1, 4, 5, 3, 2]); unsqueeze_19 = None
permute_23: "bf16[12, 9, 64, 64, 1, 1][294912, 32768, 512, 1, 3538944, 1]cpu" = torch.ops.aten.permute.default(permute_21, [1, 2, 3, 5, 0, 4]); permute_21 = None
view_55: "bf16[12, 576, 64][294912, 512, 1]cpu" = torch.ops.aten.view.default(permute_23, [12, 576, 64]); permute_23 = None
permute_24: "bf16[12, 64, 1, 64, 1, 1][64, 768, 638976, 1, 1, 1]cpu" = torch.ops.aten.permute.default(permute_22, [1, 5, 0, 4, 2, 3]); permute_22 = None
view_56: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(permute_24, [12, 64, 64]); permute_24 = None
bmm_11: "bf16[12, 576, 64][36864, 64, 1]cpu" = torch.ops.aten.bmm.default(view_55, view_56); view_55 = view_56 = None
view_57: "bf16[12, 9, 64, 1, 1, 64][36864, 4096, 64, 64, 64, 1]cpu" = torch.ops.aten.view.default(bmm_11, [12, 9, 64, 1, 1, 64]); bmm_11 = None
permute_25: "bf16[1, 12, 9, 64, 64, 1][64, 36864, 4096, 64, 1, 64]cpu" = torch.ops.aten.permute.default(view_57, [4, 0, 1, 2, 5, 3]); view_57 = None
view_58: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.ops.aten.view.default(permute_25, [1, 12, 9, 64, 64]); permute_25 = None
add_10: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(view_54, view_58); view_54 = view_58 = None
view_59: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.view.default(add_10, [108, 64, 64]); add_10 = None
view_60: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.view.default(view_59, [1, 12, 9, 64, 64]); view_59 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:775 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 0],
slice_103: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807)
slice_104: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_103, 1, 0, 9223372036854775807); slice_103 = None
select_19: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_104, 2, 0); slice_104 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:776 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -3],
slice_105: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807)
slice_106: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_105, 1, 0, 9223372036854775807); slice_105 = None
select_20: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_106, 2, -3); slice_106 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:777 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -2],
slice_107: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807)
slice_108: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_107, 1, 0, 9223372036854775807); slice_107 = None
select_21: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_108, 2, -2); slice_108 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:778 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -1],
slice_109: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 0, 0, 9223372036854775807); view_5 = None
slice_110: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_109, 1, 0, 9223372036854775807); slice_109 = None
select_22: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_110, 2, -1); slice_110 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:779 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key[:, :, -1],
slice_111: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view_10, 0, 0, 9223372036854775807); view_10 = None
slice_112: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_111, 1, 0, 9223372036854775807); slice_111 = None
select_23: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = torch.ops.aten.select.int(slice_112, 2, -1); slice_112 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:773 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_key_mat = torch.cat(
cat_8: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" = torch.ops.aten.cat.default([select_19, select_20, select_21, select_22, select_23], 2); select_19 = select_20 = select_21 = select_22 = select_23 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:785 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 0],
slice_113: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807)
slice_114: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_113, 1, 0, 9223372036854775807); slice_113 = None
select_24: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_114, 2, 0); slice_114 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:786 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -3],
slice_115: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807)
slice_116: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_115, 1, 0, 9223372036854775807); slice_115 = None
select_25: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_116, 2, -3); slice_116 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:787 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -2],
slice_117: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807)
slice_118: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_117, 1, 0, 9223372036854775807); slice_117 = None
select_26: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_118, 2, -2); slice_118 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:788 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -1],
slice_119: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 0, 0, 9223372036854775807); view_6 = None
slice_120: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_119, 1, 0, 9223372036854775807); slice_119 = None
select_27: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_120, 2, -1); slice_120 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:789 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value[:, :, -1],
slice_121: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view_14, 0, 0, 9223372036854775807); view_14 = None
slice_122: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(slice_121, 1, 0, 9223372036854775807); slice_121 = None
select_28: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = torch.ops.aten.select.int(slice_122, 2, -1); slice_122 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:783 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_value_mat = torch.cat(
cat_9: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" = torch.ops.aten.cat.default([select_24, select_25, select_26, select_27, select_28], 2); select_24 = select_25 = select_26 = select_27 = select_28 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:795 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -2], second_last_key_mat, ndim=4)
slice_123: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_4, 0, 0, 9223372036854775807)
slice_124: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_123, 1, 0, 9223372036854775807); slice_123 = None
select_29: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_124, 2, -2); slice_124 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
view_61: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(select_29, [12, 64, 64]); select_29 = None
view_62: "bf16[12, 448, 64][28672, 64, 1]cpu" = torch.ops.aten.view.default(cat_8, [-1, 448, 64]); cat_8 = None
permute_26: "bf16[12, 64, 448][28672, 1, 64]cpu" = torch.ops.aten.permute.default(view_62, [0, 2, 1]); view_62 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
bmm_12: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.ops.aten.bmm.default(view_61, permute_26); view_61 = permute_26 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
view_63: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.view.default(bmm_12, [1, 12, 64, 448]); bmm_12 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:798 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, :to_block_size],
slice_125: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 0, 0, 9223372036854775807)
slice_126: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_125, 1, 0, 9223372036854775807); slice_125 = None
slice_127: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_126, 2, 0, 9223372036854775807); slice_126 = None
slice_128: "f32[1, 1, 1, 64][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_127, 3, 0, 64); slice_127 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:799 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, -3 * to_block_size :],
slice_129: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 0, 0, 9223372036854775807)
slice_130: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_129, 1, 0, 9223372036854775807); slice_129 = None
slice_131: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_130, 2, 0, 9223372036854775807); slice_130 = None
slice_132: "f32[1, 1, 1, 192][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(slice_131, 3, -192, 9223372036854775807); slice_131 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:800 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask.new_ones([bsz, 1, 1, n_rand_blocks * to_block_size]),
full_2: "f32[1, 1, 1, 192][192, 192, 192, 1]cpu" = torch.ops.aten.full.default([1, 1, 1, 192], 1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:796 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_seq_pad = torch.cat(
cat_10: "f32[1, 1, 1, 448][448, 448, 448, 1]cpu" = torch.ops.aten.cat.default([slice_128, slice_132, full_2], 3); slice_128 = slice_132 = full_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:806 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask.new_ones([bsz, n_heads, from_block_size, 4 * to_block_size]),
full_3: "f32[1, 12, 64, 256][196608, 16384, 256, 1]cpu" = torch.ops.aten.full.default([1, 12, 64, 256], 1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:807 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask[:, :, -1],
slice_133: "f32[1, 12, 11, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.ops.aten.slice.Tensor(mul, 0, 0, 9223372036854775807); mul = None
slice_134: "f32[1, 12, 11, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.ops.aten.slice.Tensor(slice_133, 1, 0, 9223372036854775807); slice_133 = None
select_30: "f32[1, 12, 64, 192][1622016, 135168, 192, 1]cpu" = torch.ops.aten.select.int(slice_134, 2, -1); slice_134 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:804 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_rand_pad = torch.cat(
cat_11: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.cat.default([full_3, select_30], 3); full_3 = select_30 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:811 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product = second_last_product * rsqrt_d
mul_15: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.mul.Tensor(view_63, 0.125); view_63 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:812 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product += (1.0 - torch.minimum(second_last_seq_pad, second_last_rand_pad)) * attn_mask_penalty
minimum_1: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.minimum.default(cat_10, cat_11); cat_10 = cat_11 = None
sub_9: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, minimum_1); minimum_1 = None
mul_16: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.mul.Tensor(sub_9, -10000.0); sub_9 = None
add_11: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.add.Tensor(mul_15, mul_16); mul_15 = mul_16 = None
convert_element_type_39: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.prims.convert_element_type.default(add_11, torch.bfloat16); add_11 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:813 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_attn_weights = nn.functional.softmax(
convert_element_type_40: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.prims.convert_element_type.default(convert_element_type_39, torch.float32); convert_element_type_39 = None
amax_3: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.amax.default(convert_element_type_40, [-1], True)
sub_10: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.sub.Tensor(convert_element_type_40, amax_3); convert_element_type_40 = amax_3 = None
exp_3: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.exp.default(sub_10); sub_10 = None
sum_4: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp_3, [-1], True)
div_5: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.div.Tensor(exp_3, sum_4); exp_3 = sum_4 = None
convert_element_type_41: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.prims.convert_element_type.default(div_5, torch.bfloat16); div_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
view_64: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.ops.aten.view.default(convert_element_type_41, [-1, 64, 448]); convert_element_type_41 = None
view_65: "bf16[12, 448, 64][28672, 64, 1]cpu" = torch.ops.aten.view.default(cat_9, [-1, 448, 64]); cat_9 = None
bmm_13: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_64, view_65); view_64 = view_65 = None
view_66: "bf16[1, 12, 64, 64][49152, 4096, 64, 1]cpu" = torch.ops.aten.view.default(bmm_13, [1, 12, 64, 64]); bmm_13 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:819 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_context_layer.unsqueeze_(2)
unsqueeze_20: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(view_66, 2); view_66 = None
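        # Note: the model source calls the in-place unsqueeze_(2), but the trace
        # records the pure aten.unsqueeze.default because functionalization rewrites
        # mutations into out-of-place ops with identical values. A quick equivalence check:
        import torch
        t = torch.randn(1, 12, 64, 64)
        assert torch.equal(t.unsqueeze(2), t.clone().unsqueeze_(2))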
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:826 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -1], key_layer, ndim=4)
slice_135: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_4, 0, 0, 9223372036854775807); view_4 = None
slice_136: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(slice_135, 1, 0, 9223372036854775807); slice_135 = None
select_31: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(slice_136, 2, -1); slice_136 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
view_67: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(select_31, [12, 64, 64]); select_31 = None
view_68: "bf16[12, 832, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(arg14_1, [12, 832, 64]); arg14_1 = None
permute_27: "bf16[12, 64, 832][64, 1, 768]cpu" = torch.ops.aten.permute.default(view_68, [0, 2, 1]); view_68 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
bmm_14: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.ops.aten.bmm.default(view_67, permute_27); view_67 = permute_27 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
view_69: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.view.default(bmm_14, [1, 12, 64, 832]); bmm_14 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:827 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product = last_product * rsqrt_d
mul_17: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.mul.Tensor(view_69, 0.125); view_69 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:828 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product += (1.0 - to_mask) * attn_mask_penalty
sub_11: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, arg16_1); arg16_1 = None
mul_18: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.mul.Tensor(sub_11, -10000.0); sub_11 = None
add_12: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.add.Tensor(mul_17, mul_18); mul_17 = mul_18 = None
convert_element_type_46: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.prims.convert_element_type.default(add_12, torch.bfloat16); add_12 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:829 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_attn_weights = nn.functional.softmax(last_product, dim=-1) # [bsz, n_heads, from_block_size, n]
convert_element_type_47: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.prims.convert_element_type.default(convert_element_type_46, torch.float32); convert_element_type_46 = None
amax_4: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.amax.default(convert_element_type_47, [-1], True)
sub_12: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.sub.Tensor(convert_element_type_47, amax_4); convert_element_type_47 = amax_4 = None
exp_4: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.exp.default(sub_12); sub_12 = None
sum_5: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp_4, [-1], True)
div_6: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.div.Tensor(exp_4, sum_5); exp_4 = sum_5 = None
convert_element_type_48: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.prims.convert_element_type.default(div_6, torch.bfloat16); div_6 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
view_70: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.ops.aten.view.default(convert_element_type_48, [-1, 64, 832]); convert_element_type_48 = None
view_71: "bf16[12, 832, 64][64, 768, 1]cpu" = torch.ops.aten.view.default(arg15_1, [12, 832, 64]); arg15_1 = None
bmm_15: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_70, view_71); view_70 = view_71 = None
view_72: "bf16[1, 12, 64, 64][49152, 4096, 64, 1]cpu" = torch.ops.aten.view.default(bmm_15, [1, 12, 64, 64]); bmm_15 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:833 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_context_layer.unsqueeze_(2)
unsqueeze_21: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(view_72, 2); view_72 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:836 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer = torch.cat(
cat_12: "bf16[1, 12, 13, 64, 64][638976, 53248, 4096, 64, 1]cpu" = torch.ops.aten.cat.default([unsqueeze_4, unsqueeze_5, view_60, unsqueeze_20, unsqueeze_21], 2); unsqueeze_4 = unsqueeze_5 = view_60 = unsqueeze_20 = unsqueeze_21 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:840 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer = context_layer.view((bsz, n_heads, from_seq_len, -1)) * from_mask
view_73: "bf16[1, 12, 832, 64][638976, 53248, 64, 1]cpu" = torch.ops.aten.view.default(cat_12, [1, 12, 832, -1]); cat_12 = None
mul_19: "f32[1, 12, 832, 64][638976, 53248, 64, 1]cpu" = torch.ops.aten.mul.Tensor(view_73, arg18_1); view_73 = arg18_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:841 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer = torch.transpose(context_layer, 1, 2)
permute_28: "f32[1, 832, 12, 64][638976, 64, 53248, 1]cpu" = torch.ops.aten.permute.default(mul_19, [0, 2, 1, 3]); mul_19 = None
return (permute_28, clone)
V0627 17:31:05.639000 139845268738432 torch/_inductor/compile_fx.py:754] {"inductor_post_grad_graph": {}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0, "has_payload": "ee4f5da4b7396f62d53589c7ddc358c5"}
class <lambda>(torch.nn.Module):
def forward(self, arg0_1: "i32[11, 3][3, 1]cpu", arg1_1: "i32[11, 3][3, 1]cpu", arg2_1: "i32[11, 3][3, 1]cpu", arg3_1: "i32[11, 3][3, 1]cpu", arg4_1: "i32[11, 3][3, 1]cpu", arg5_1: "i32[11, 3][3, 1]cpu", arg6_1: "i32[11, 3][3, 1]cpu", arg7_1: "i32[11, 3][3, 1]cpu", arg8_1: "i32[11, 3][3, 1]cpu", arg9_1: "i32[11, 3][3, 1]cpu", arg10_1: "i32[11, 3][3, 1]cpu", arg11_1: "i32[11, 3][3, 1]cpu", arg12_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", arg13_1: "f32[1, 13, 64][832, 64, 1]cpu", arg14_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", arg15_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", arg16_1: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu", arg17_1: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu", arg18_1: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu"):
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:602 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_query_matrix = query_layer.view(bsz, n_heads, from_seq_len // from_block_size, from_block_size, -1)
view_4: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.reshape.default(arg12_1, [1, 12, 13, 64, -1]); arg12_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:621 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, 0], key_layer, ndim=4)
select_2: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_4, 2, 0)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
view_15: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(select_2, [12, 64, 64]); select_2 = None
view_16: "bf16[12, 832, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(arg14_1, [12, 832, 64])
permute_2: "bf16[12, 64, 832][64, 1, 768]cpu" = torch.ops.aten.permute.default(view_16, [0, 2, 1]); view_16 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
bmm: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.ops.aten.bmm.default(view_15, permute_2); view_15 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
view_17: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.reshape.default(bmm, [1, 12, 64, 832]); bmm = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:623 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product = first_product * rsqrt_d
mul_3: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.mul.Tensor(view_17, 0.125); view_17 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:624 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product += (1.0 - to_mask) * attn_mask_penalty
sub: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, arg16_1)
mul_4: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.mul.Tensor(sub, -10000.0); sub = None
add_2: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.add.Tensor(mul_3, mul_4); mul_3 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:625 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_attn_weights = nn.functional.softmax(
amax: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.amax.default(add_2, [-1], True)
sub_1: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.sub.Tensor(add_2, amax); add_2 = amax = None
exp: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.exp.default(sub_1); sub_1 = None
sum_1: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp, [-1], True)
div_2: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.div.Tensor(exp, sum_1); exp = sum_1 = None
convert_element_type_5: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.prims.convert_element_type.default(div_2, torch.bfloat16); div_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
view_18: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.ops.aten.reshape.default(convert_element_type_5, [-1, 64, 832]); convert_element_type_5 = None
view_19: "bf16[12, 832, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(arg15_1, [12, 832, 64])
bmm_1: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_18, view_19); view_18 = None
view_20: "bf16[1, 12, 64, 64][49152, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_1, [1, 12, 64, 64]); bmm_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:631 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_context_layer.unsqueeze_(2)
unsqueeze_4: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(view_20, 2); view_20 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:661 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, 1], second_key_mat, ndim=4)
select_13: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_4, 2, 1)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
view_21: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(select_13, [12, 64, 64]); select_13 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:603 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix = key_layer.view(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1)
view_5: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.reshape.default(arg14_1, [1, 12, 13, 64, -1]); arg14_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:641 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 0],
select_3: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_5, 2, 0)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:642 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 1],
select_4: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_5, 2, 1)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:643 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 2],
select_5: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_5, 2, 2)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:644 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -1],
select_6: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_5, 2, -1)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:976 in torch_gather_b2, code: flattened_params = params.reshape(-1, params.shape[-2], params.shape[-1])
clone_2: "bf16[1, 12, 13, 64, 64][638976, 53248, 4096, 64, 1]cpu" = torch.ops.aten.clone.default(view_5, memory_format = torch.contiguous_format)
view_8: "bf16[156, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.reshape.default(clone_2, [156, 64, 64]); clone_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:593 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = np.stack(rand_attn, axis=0)
cat: "i32[132, 3][3, 1]cpu" = torch.ops.aten.cat.default([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1]); arg0_1 = arg1_1 = arg2_1 = arg3_1 = arg4_1 = arg5_1 = arg6_1 = arg7_1 = arg8_1 = arg9_1 = arg10_1 = arg11_1 = None
view: "i32[12, 11, 3][33, 3, 1]cpu" = torch.ops.aten.reshape.default(cat, [12, 11, 3]); cat = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:594 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = torch.tensor(rand_attn, device=query_layer.device, dtype=torch.long)
convert_element_type: "i64[12, 11, 3][33, 3, 1]cpu" = torch.ops.prims.convert_element_type.default(view, torch.int64); view = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:595 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn.unsqueeze_(0)
unsqueeze: "i64[1, 12, 11, 3][396, 33, 3, 1]cpu" = torch.ops.aten.unsqueeze.default(convert_element_type, 0); convert_element_type = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:975 in torch_gather_b2, code: flattened_indices = indices.view(-1) + indices_shift
view_7: "i64[396][1]cpu" = torch.ops.aten.reshape.default(unsqueeze, [-1])
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:972 in torch_gather_b2, code: shift = torch.arange(indices.shape[0] * indices.shape[1] * num_indices_to_gather, device=indices.device)
iota: "i64[396][1]cpu" = torch.ops.prims.iota.default(396, start = 0, step = 1, dtype = torch.int64, device = device(type='cpu'), requires_grad = False)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:973 in torch_gather_b2, code: indices_shift = torch.div(shift, num_indices_to_gather, rounding_mode="floor") * num_indices_to_pick_from
div: "i64[396][1]cpu" = torch.ops.aten.div.Tensor_mode(iota, 33, rounding_mode = 'floor'); iota = None
mul_1: "i64[396][1]cpu" = torch.ops.aten.mul.Tensor(div, 13); div = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:975 in torch_gather_b2, code: flattened_indices = indices.view(-1) + indices_shift
add: "i64[396][1]cpu" = torch.ops.aten.add.Tensor(view_7, mul_1); view_7 = mul_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:978 in torch_gather_b2, code: out_flattened = flattened_params.index_select(0, flattened_indices)
index_1: "bf16[396, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.index.Tensor(view_8, [add]); view_8 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:980 in torch_gather_b2, code: out = out_flattened.reshape(params.shape[:2] + (num_indices_to_gather,) + params.shape[3:])
view_9: "bf16[1, 12, 33, 64, 64][1622016, 135168, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(index_1, [1, 12, 33, 64, 64]); index_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:608 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key = gathered_key.view(
view_10: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.reshape.default(view_9, [1, 12, 11, 192, -1]); view_9 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:645 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key[:, :, 0],
select_7: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = torch.ops.aten.select.int(view_10, 2, 0)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:639 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_key_mat = torch.cat(
cat_1: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" = torch.ops.aten.cat.default([select_3, select_4, select_5, select_6, select_7], 2); select_4 = select_5 = select_7 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
view_22: "bf16[12, 448, 64][28672, 64, 1]cpu" = torch.ops.aten.reshape.default(cat_1, [-1, 448, 64]); cat_1 = None
permute_3: "bf16[12, 64, 448][28672, 1, 64]cpu" = torch.ops.aten.permute.default(view_22, [0, 2, 1]); view_22 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
bmm_2: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.ops.aten.bmm.default(view_21, permute_3); view_21 = permute_3 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
view_23: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.reshape.default(bmm_2, [1, 12, 64, 448]); bmm_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:677 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product = second_product * rsqrt_d
mul_5: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.mul.Tensor(view_23, 0.125); view_23 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:664 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, : 3 * to_block_size],
slice_30: "f32[1, 1, 1, 192][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 3, 0, 192)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:665 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, -to_block_size:],
slice_34: "f32[1, 1, 1, 64][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 3, -64, 9223372036854775807)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:666 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask.new_ones([bsz, 1, 1, n_rand_blocks * to_block_size]),
full_default: "f32[1, 1, 1, 192][192, 192, 192, 1]cpu" = torch.ops.aten.full.default([1, 1, 1, 192], 1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:662 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_seq_pad = torch.cat(
cat_3: "f32[1, 1, 1, 448][448, 448, 448, 1]cpu" = torch.ops.aten.cat.default([slice_30, slice_34, full_default], 3); slice_30 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:672 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask.new_ones([bsz, n_heads, from_block_size, 4 * to_block_size]),
full_default_1: "f32[1, 12, 64, 256][196608, 16384, 256, 1]cpu" = torch.ops.aten.full.default([1, 12, 64, 256], 1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1017 in _create_rand_mask_from_inputs, code: rand_mask = torch.einsum("blq,bhlk->bhlqk", from_blocked_mask[:, 1:-1], rand_mask)
slice_2: "f32[1, 11, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(arg13_1, 1, 1, -1)
unsqueeze_1: "f32[1, 11, 64, 1][832, 64, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_2, 3); slice_2 = None
unsqueeze_2: "f32[1, 11, 64, 1, 1][832, 64, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_1, 4); unsqueeze_1 = None
permute: "f32[1, 1, 11, 64, 1][832, 1, 64, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_2, [0, 3, 1, 2, 4]); unsqueeze_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in _create_rand_mask_from_inputs, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
select: "f32[13, 64][64, 1]cpu" = torch.ops.aten.select.int(arg13_1, 0, 0); arg13_1 = None
select_1: "i64[12, 11, 3][33, 3, 1]cpu" = torch.ops.aten.select.int(unsqueeze, 0, 0)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in <listcomp>, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
view_1: "i64[396][1]cpu" = torch.ops.aten.reshape.default(select_1, [396]); select_1 = None
index: "f32[396, 64][64, 1]cpu" = torch.ops.aten.index.Tensor(select, [view_1]); select = view_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in _create_rand_mask_from_inputs, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
view_2: "f32[1, 396, 64][25344, 64, 1]cpu" = torch.ops.aten.reshape.default(index, [1, 396, 64]); index = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1016 in _create_rand_mask_from_inputs, code: rand_mask = rand_mask.view(batch_size, num_attention_heads, num_windows, num_rand_blocks * from_block_size)
view_3: "f32[1, 12, 11, 192][25344, 2112, 192, 1]cpu" = torch.ops.aten.reshape.default(view_2, [1, 12, 11, 192]); view_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1017 in _create_rand_mask_from_inputs, code: rand_mask = torch.einsum("blq,bhlk->bhlqk", from_blocked_mask[:, 1:-1], rand_mask)
unsqueeze_3: "f32[1, 12, 11, 192, 1][25344, 2112, 192, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(view_3, 4); view_3 = None
permute_1: "f32[1, 12, 11, 1, 192][25344, 2112, 192, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_3, [0, 1, 2, 4, 3]); unsqueeze_3 = None
mul: "f32[1, 12, 11, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(permute, permute_1); permute = permute_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:673 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask[:, :, 0],
select_14: "f32[1, 12, 64, 192][1622016, 135168, 192, 1]cpu" = torch.ops.aten.select.int(mul, 2, 0)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:670 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_rand_pad = torch.cat(
cat_4: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.cat.default([full_default_1, select_14], 3); select_14 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:678 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product += (1.0 - torch.minimum(second_seq_pad, second_rand_pad)) * attn_mask_penalty
minimum: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.minimum.default(cat_3, cat_4); cat_3 = cat_4 = None
sub_2: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, minimum); minimum = None
mul_6: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.mul.Tensor(sub_2, -10000.0); sub_2 = None
add_3: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.add.Tensor(mul_5, mul_6); mul_5 = mul_6 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:679 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_attn_weights = nn.functional.softmax(
amax_1: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.amax.default(add_3, [-1], True)
sub_3: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.sub.Tensor(add_3, amax_1); add_3 = amax_1 = None
exp_1: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.exp.default(sub_3); sub_3 = None
sum_2: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp_1, [-1], True)
div_3: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.div.Tensor(exp_1, sum_2); exp_1 = sum_2 = None
convert_element_type_12: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.prims.convert_element_type.default(div_3, torch.bfloat16); div_3 = None
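        # NOTE: nn.functional.softmax decomposes into the numerically stable
        # amax -> sub -> exp -> sum -> div chain. The mask-penalty addend above is fp32,
        # so type promotion keeps the scores and the whole softmax in fp32; the result is
        # cast back to bf16 only for the bmm that consumes it. The fused cpp kernel later
        # in this log reproduces the same pattern for the last block's softmax.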
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
view_24: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.ops.aten.reshape.default(convert_element_type_12, [-1, 64, 448]); convert_element_type_12 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:604 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix = value_layer.view(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1)
view_6: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.reshape.default(arg15_1, [1, 12, 13, 64, -1]); arg15_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:651 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 0],
select_8: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_6, 2, 0)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:652 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 1],
select_9: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_6, 2, 1)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:653 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 2],
select_10: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_6, 2, 2)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:654 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -1],
select_11: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_6, 2, -1)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:976 in torch_gather_b2, code: flattened_params = params.reshape(-1, params.shape[-2], params.shape[-1])
clone_3: "bf16[1, 12, 13, 64, 64][638976, 53248, 4096, 64, 1]cpu" = torch.ops.aten.clone.default(view_6, memory_format = torch.contiguous_format)
view_12: "bf16[156, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.reshape.default(clone_3, [156, 64, 64]); clone_3 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:978 in torch_gather_b2, code: out_flattened = flattened_params.index_select(0, flattened_indices)
index_2: "bf16[396, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.index.Tensor(view_12, [add]); view_12 = add = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:980 in torch_gather_b2, code: out = out_flattened.reshape(params.shape[:2] + (num_indices_to_gather,) + params.shape[3:])
view_13: "bf16[1, 12, 33, 64, 64][1622016, 135168, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(index_2, [1, 12, 33, 64, 64]); index_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:612 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value = gathered_value.view(
view_14: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.reshape.default(view_13, [1, 12, 11, 192, -1]); view_13 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:655 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value[:, :, 0],
select_12: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = torch.ops.aten.select.int(view_14, 2, 0)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:649 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_value_mat = torch.cat(
cat_2: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" = torch.ops.aten.cat.default([select_8, select_9, select_10, select_11, select_12], 2); select_9 = select_10 = select_12 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
view_25: "bf16[12, 448, 64][28672, 64, 1]cpu" = torch.ops.aten.reshape.default(cat_2, [-1, 448, 64]); cat_2 = None
bmm_3: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_24, view_25); view_24 = view_25 = None
view_26: "bf16[1, 12, 64, 64][49152, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_3, [1, 12, 64, 64]); bmm_3 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:686 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_context_layer.unsqueeze_(2)
unsqueeze_5: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(view_26, 2); view_26 = None
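        # NOTE: the in-place `second_context_layer.unsqueeze_(2)` is functionalized into
        # an out-of-place aten.unsqueeze, keeping the traced graph mutation-free.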
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:702 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: middle_query_matrix = blocked_query_matrix[:, :, 2:-2]
slice_57: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_4, 2, 2, -2)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:717 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product = torch.einsum(
unsqueeze_6: "bf16[1, 12, 9, 64, 64, 1][638976, 64, 49152, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_57, 5)
permute_6: "bf16[1, 12, 9, 64, 1, 64][638976, 64, 49152, 768, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_6, [0, 1, 2, 3, 5, 4]); unsqueeze_6 = None
permute_8: "bf16[12, 9, 64, 64, 1, 1][64, 49152, 768, 1, 638976, 1]cpu" = torch.ops.aten.permute.default(permute_6, [1, 2, 3, 5, 0, 4]); permute_6 = None
view_33: "bf16[12, 576, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(permute_8, [12, 576, 64]); permute_8 = None
unsqueeze_7: "bf16[1, 12, 64, 64, 1][638976, 64, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(select_3, 4)
unsqueeze_8: "bf16[1, 12, 64, 64, 1, 1][638976, 64, 768, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_7, 5); unsqueeze_7 = None
permute_7: "bf16[1, 12, 1, 1, 64, 64][638976, 64, 1, 1, 768, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_8, [0, 1, 4, 5, 2, 3]); unsqueeze_8 = None
permute_9: "bf16[12, 64, 1, 64, 1, 1][64, 1, 638976, 768, 1, 1]cpu" = torch.ops.aten.permute.default(permute_7, [1, 5, 0, 4, 2, 3]); permute_7 = None
view_34: "bf16[12, 64, 64][64, 1, 768]cpu" = torch.ops.aten.reshape.default(permute_9, [12, 64, 64]); permute_9 = None
bmm_6: "bf16[12, 576, 64][36864, 64, 1]cpu" = torch.ops.aten.bmm.default(view_33, view_34); view_34 = None
view_35: "bf16[12, 9, 64, 1, 1, 64][36864, 4096, 64, 64, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_6, [12, 9, 64, 1, 1, 64]); bmm_6 = None
permute_10: "bf16[1, 12, 9, 64, 64, 1][64, 36864, 4096, 64, 1, 64]cpu" = torch.ops.aten.permute.default(view_35, [4, 0, 1, 2, 5, 3]); view_35 = None
view_36: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(permute_10, [1, 12, 9, 64, 64]); permute_10 = None
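        # NOTE: the einsum "bhlqd,bhkd->bhlqk" (middle queries x first key block) is
        # lowered to one bmm by flattening l and q: queries (1, 12, 9, 64, 64) become
        # (12, 576, 64), the key block is exposed as a (12, 64, 64) transposed [h, d, k]
        # view purely via strides, and the reshape/permute pair after the bmm unflattens
        # the (12, 576, 64) result back to (1, 12, 9, 64, 64).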
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:720 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product = first_band_product * rsqrt_d
mul_9: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.mul.Tensor(view_36, 0.125); view_36 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:730 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product += (1.0 - to_mask[:, :, :, :to_block_size].unsqueeze(3)) * attn_mask_penalty
slice_68: "f32[1, 1, 1, 64][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 3, 0, 64)
unsqueeze_12: "f32[1, 1, 1, 1, 64][832, 832, 832, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_68, 3)
sub_5: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, unsqueeze_12); unsqueeze_12 = None
mul_12: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = torch.ops.aten.mul.Tensor(sub_5, -10000.0); sub_5 = None
add_5: "f32[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(mul_9, mul_12); mul_9 = mul_12 = None
convert_element_type_24: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.prims.convert_element_type.default(add_5, torch.bfloat16); add_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
clone_4: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.clone.default(slice_57, memory_format = torch.contiguous_format); slice_57 = None
view_27: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.reshape.default(clone_4, [108, 64, 64]); clone_4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:696 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: [blocked_key_matrix[:, :, 1:-3], blocked_key_matrix[:, :, 2:-2], blocked_key_matrix[:, :, 3:-1]], dim=3
slice_39: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 2, 1, -3)
slice_42: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 2, 2, -2)
slice_45: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 2, 3, -1)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:695 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: exp_blocked_key_matrix = torch.cat(
cat_5: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" = torch.ops.aten.cat.default([slice_39, slice_42, slice_45], 3); slice_39 = slice_42 = slice_45 = None
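        # NOTE: the sliding-window ("inner band") keys are built from three overlapping
        # block slices shifted by one block each (1:-3, 2:-2, 3:-1), so every middle
        # query block attends to its left, own, and right key blocks -- 3 * 64 = 192
        # keys per query block -- without any explicit gather.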
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
view_28: "bf16[108, 192, 64][12288, 64, 1]cpu" = torch.ops.aten.reshape.default(cat_5, [-1, 192, 64]); cat_5 = None
permute_4: "bf16[108, 64, 192][12288, 1, 64]cpu" = torch.ops.aten.permute.default(view_28, [0, 2, 1]); view_28 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
bmm_4: "bf16[108, 64, 192][12288, 192, 1]cpu" = torch.ops.aten.bmm.default(view_27, permute_4); permute_4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
view_29: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.reshape.default(bmm_4, [1, 12, 9, 64, 192]); bmm_4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:708 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: inner_band_product = inner_band_product * rsqrt_d
mul_7: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(view_29, 0.125); view_29 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:729 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: inner_band_product += (1.0 - band_mask) * attn_mask_penalty
sub_4: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, arg17_1); arg17_1 = None
mul_11: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(sub_4, -10000.0); sub_4 = None
add_4: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.add.Tensor(mul_7, mul_11); mul_7 = mul_11 = None
convert_element_type_23: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.prims.convert_element_type.default(add_4, torch.bfloat16); add_4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:712 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product = self.torch_bmm_nd_transpose(middle_query_matrix, gathered_key[:, :, 1:-1], ndim=5)
slice_60: "bf16[1, 12, 9, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view_10, 2, 1, -1)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
clone_6: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" = torch.ops.aten.clone.default(slice_60, memory_format = torch.contiguous_format); slice_60 = None
view_31: "bf16[108, 192, 64][12288, 64, 1]cpu" = torch.ops.aten.reshape.default(clone_6, [108, 192, 64]); clone_6 = None
permute_5: "bf16[108, 64, 192][12288, 1, 64]cpu" = torch.ops.aten.permute.default(view_31, [0, 2, 1]); view_31 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
bmm_5: "bf16[108, 64, 192][12288, 192, 1]cpu" = torch.ops.aten.bmm.default(view_27, permute_5); view_27 = permute_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
view_32: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.reshape.default(bmm_5, [1, 12, 9, 64, 192]); bmm_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:714 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product = rand_band_product * rsqrt_d
mul_8: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(view_32, 0.125); view_32 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:732 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product += (1.0 - rand_mask[:, :, 1:-1]) * attn_mask_penalty
slice_75: "f32[1, 12, 9, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.ops.aten.slice.Tensor(mul, 2, 1, -1)
sub_7: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, slice_75); slice_75 = None
mul_14: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(sub_7, -10000.0); sub_7 = None
add_7: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.add.Tensor(mul_8, mul_14); mul_8 = mul_14 = None
convert_element_type_26: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.prims.convert_element_type.default(add_7, torch.bfloat16); add_7 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:723 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product = torch.einsum(
unsqueeze_10: "bf16[1, 12, 64, 64, 1][638976, 64, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(select_6, 4)
unsqueeze_11: "bf16[1, 12, 64, 64, 1, 1][638976, 64, 768, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_10, 5); unsqueeze_10 = None
permute_12: "bf16[1, 12, 1, 1, 64, 64][638976, 64, 1, 1, 768, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_11, [0, 1, 4, 5, 2, 3]); unsqueeze_11 = None
permute_14: "bf16[12, 64, 1, 64, 1, 1][64, 1, 638976, 768, 1, 1]cpu" = torch.ops.aten.permute.default(permute_12, [1, 5, 0, 4, 2, 3]); permute_12 = None
view_38: "bf16[12, 64, 64][64, 1, 768]cpu" = torch.ops.aten.reshape.default(permute_14, [12, 64, 64]); permute_14 = None
bmm_7: "bf16[12, 576, 64][36864, 64, 1]cpu" = torch.ops.aten.bmm.default(view_33, view_38); view_33 = view_38 = None
view_39: "bf16[12, 9, 64, 1, 1, 64][36864, 4096, 64, 64, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_7, [12, 9, 64, 1, 1, 64]); bmm_7 = None
permute_15: "bf16[1, 12, 9, 64, 64, 1][64, 36864, 4096, 64, 1, 64]cpu" = torch.ops.aten.permute.default(view_39, [4, 0, 1, 2, 5, 3]); view_39 = None
view_40: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(permute_15, [1, 12, 9, 64, 64]); permute_15 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:726 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product = last_band_product * rsqrt_d
mul_10: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.mul.Tensor(view_40, 0.125); view_40 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:731 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product += (1.0 - to_mask[:, :, :, -to_block_size:].unsqueeze(3)) * attn_mask_penalty
unsqueeze_13: "f32[1, 1, 1, 1, 64][832, 832, 832, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_34, 3); slice_34 = None
sub_6: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, unsqueeze_13); unsqueeze_13 = None
mul_13: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = torch.ops.aten.mul.Tensor(sub_6, -10000.0); sub_6 = None
add_6: "f32[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(mul_10, mul_13); mul_10 = mul_13 = None
convert_element_type_25: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.prims.convert_element_type.default(add_6, torch.bfloat16); add_6 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:735 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: band_product = torch.cat(
cat_7: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.cat.default([convert_element_type_24, convert_element_type_23, convert_element_type_26, convert_element_type_25], -1); convert_element_type_24 = convert_element_type_23 = convert_element_type_26 = convert_element_type_25 = None
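        # NOTE: band_product stitches the four score pieces along the key axis:
        # 64 (global first block) + 192 (sliding window) + 192 (random blocks) +
        # 64 (global last block) = 512 scores per middle-block query, which the
        # 512-wide fp32 softmax below normalizes over before the pieces are sliced
        # back apart.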
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:740 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights = nn.functional.softmax(
convert_element_type_27: "f32[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.prims.convert_element_type.default(cat_7, torch.float32); cat_7 = None
amax_2: "f32[1, 12, 9, 64, 1][6912, 576, 64, 1, 1]cpu" = torch.ops.aten.amax.default(convert_element_type_27, [-1], True)
sub_8: "f32[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.sub.Tensor(convert_element_type_27, amax_2); convert_element_type_27 = amax_2 = None
exp_2: "f32[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.exp.default(sub_8); sub_8 = None
sum_3: "f32[1, 12, 9, 64, 1][6912, 576, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp_2, [-1], True)
div_4: "f32[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.div.Tensor(exp_2, sum_3); exp_2 = sum_3 = None
convert_element_type_28: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.prims.convert_element_type.default(div_4, torch.bfloat16); div_4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:747 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights[:, :, :, :, to_block_size : 4 * to_block_size], exp_blocked_value_matrix, ndim=5
slice_80: "bf16[1, 12, 9, 64, 192][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(convert_element_type_28, 4, 64, 256)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
view_41: "bf16[108, 64, 192][32768, 512, 1]cpu" = torch.ops.aten.reshape.default(slice_80, [108, 64, 192]); slice_80 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:699 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: [blocked_value_matrix[:, :, 1:-3], blocked_value_matrix[:, :, 2:-2], blocked_value_matrix[:, :, 3:-1]],
slice_48: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 2, 1, -3)
slice_51: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 2, 2, -2)
slice_54: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 2, 3, -1)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:698 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: exp_blocked_value_matrix = torch.cat(
cat_6: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" = torch.ops.aten.cat.default([slice_48, slice_51, slice_54], 3); slice_48 = slice_51 = slice_54 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
view_42: "bf16[108, 192, 64][12288, 64, 1]cpu" = torch.ops.aten.reshape.default(cat_6, [-1, 192, 64]); cat_6 = None
bmm_8: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_41, view_42); view_41 = view_42 = None
view_43: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_8, [1, 12, 9, 64, 64]); bmm_8 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:754 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights[:, :, :, :, 4 * to_block_size : -to_block_size], gathered_value[:, :, 1:-1], ndim=5
slice_85: "bf16[1, 12, 9, 64, 192][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(convert_element_type_28, 4, 256, -64)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
view_44: "bf16[108, 64, 192][32768, 512, 1]cpu" = torch.ops.aten.reshape.default(slice_85, [108, 64, 192]); slice_85 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:754 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights[:, :, :, :, 4 * to_block_size : -to_block_size], gathered_value[:, :, 1:-1], ndim=5
slice_88: "bf16[1, 12, 9, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view_14, 2, 1, -1)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
clone_7: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" = torch.ops.aten.clone.default(slice_88, memory_format = torch.contiguous_format); slice_88 = None
view_45: "bf16[108, 192, 64][12288, 64, 1]cpu" = torch.ops.aten.reshape.default(clone_7, [108, 192, 64]); clone_7 = None
bmm_9: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_44, view_45); view_44 = view_45 = None
view_46: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_9, [1, 12, 9, 64, 64]); bmm_9 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:753 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += self.torch_bmm_nd(
add_8: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(view_43, view_46); view_43 = view_46 = None
view_47: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.reshape.default(add_8, [108, 64, 64]); add_8 = None
view_48: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(view_47, [1, 12, 9, 64, 64]); view_47 = None
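        # NOTE: the in-place `context_layer += ...` is functionalized into an
        # out-of-place aten.add; view_47/view_48 appear to re-materialize the
        # (108, 64, 64) bmm base and re-take the 5-d view, a reshape round-trip with
        # no data movement that inductor can later eliminate.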
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:760 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, :to_block_size], blocked_value_matrix[:, :, 0]
slice_93: "bf16[1, 12, 9, 64, 64][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(convert_element_type_28, 4, 0, 64)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:759 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += torch.einsum(
unsqueeze_14: "bf16[1, 12, 9, 64, 64, 1][3538944, 294912, 32768, 512, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_93, 5); slice_93 = None
permute_16: "bf16[1, 12, 9, 64, 1, 64][3538944, 294912, 32768, 512, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_14, [0, 1, 2, 3, 5, 4]); unsqueeze_14 = None
permute_18: "bf16[12, 9, 64, 64, 1, 1][294912, 32768, 512, 1, 3538944, 1]cpu" = torch.ops.aten.permute.default(permute_16, [1, 2, 3, 5, 0, 4]); permute_16 = None
view_49: "bf16[12, 576, 64][294912, 512, 1]cpu" = torch.ops.aten.reshape.default(permute_18, [12, 576, 64]); permute_18 = None
unsqueeze_15: "bf16[1, 12, 64, 64, 1][638976, 64, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(select_8, 4)
unsqueeze_16: "bf16[1, 12, 64, 64, 1, 1][638976, 64, 768, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_15, 5); unsqueeze_15 = None
permute_17: "bf16[1, 12, 1, 1, 64, 64][638976, 64, 1, 1, 1, 768]cpu" = torch.ops.aten.permute.default(unsqueeze_16, [0, 1, 4, 5, 3, 2]); unsqueeze_16 = None
permute_19: "bf16[12, 64, 1, 64, 1, 1][64, 768, 638976, 1, 1, 1]cpu" = torch.ops.aten.permute.default(permute_17, [1, 5, 0, 4, 2, 3]); permute_17 = None
view_50: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(permute_19, [12, 64, 64]); permute_19 = None
bmm_10: "bf16[12, 576, 64][36864, 64, 1]cpu" = torch.ops.aten.bmm.default(view_49, view_50); view_49 = view_50 = None
view_51: "bf16[12, 9, 64, 1, 1, 64][36864, 4096, 64, 64, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_10, [12, 9, 64, 1, 1, 64]); bmm_10 = None
permute_20: "bf16[1, 12, 9, 64, 64, 1][64, 36864, 4096, 64, 1, 64]cpu" = torch.ops.aten.permute.default(view_51, [4, 0, 1, 2, 5, 3]); view_51 = None
view_52: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(permute_20, [1, 12, 9, 64, 64]); permute_20 = None
add_9: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(view_48, view_52); view_48 = view_52 = None
view_53: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.reshape.default(add_9, [108, 64, 64]); add_9 = None
view_54: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(view_53, [1, 12, 9, 64, 64]); view_53 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:763 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, -to_block_size:], blocked_value_matrix[:, :, -1]
slice_100: "bf16[1, 12, 9, 64, 64][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(convert_element_type_28, 4, -64, 9223372036854775807); convert_element_type_28 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:762 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += torch.einsum(
unsqueeze_17: "bf16[1, 12, 9, 64, 64, 1][3538944, 294912, 32768, 512, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_100, 5); slice_100 = None
permute_21: "bf16[1, 12, 9, 64, 1, 64][3538944, 294912, 32768, 512, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_17, [0, 1, 2, 3, 5, 4]); unsqueeze_17 = None
permute_23: "bf16[12, 9, 64, 64, 1, 1][294912, 32768, 512, 1, 3538944, 1]cpu" = torch.ops.aten.permute.default(permute_21, [1, 2, 3, 5, 0, 4]); permute_21 = None
view_55: "bf16[12, 576, 64][294912, 512, 1]cpu" = torch.ops.aten.reshape.default(permute_23, [12, 576, 64]); permute_23 = None
unsqueeze_18: "bf16[1, 12, 64, 64, 1][638976, 64, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(select_11, 4)
unsqueeze_19: "bf16[1, 12, 64, 64, 1, 1][638976, 64, 768, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_18, 5); unsqueeze_18 = None
permute_22: "bf16[1, 12, 1, 1, 64, 64][638976, 64, 1, 1, 1, 768]cpu" = torch.ops.aten.permute.default(unsqueeze_19, [0, 1, 4, 5, 3, 2]); unsqueeze_19 = None
permute_24: "bf16[12, 64, 1, 64, 1, 1][64, 768, 638976, 1, 1, 1]cpu" = torch.ops.aten.permute.default(permute_22, [1, 5, 0, 4, 2, 3]); permute_22 = None
view_56: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(permute_24, [12, 64, 64]); permute_24 = None
bmm_11: "bf16[12, 576, 64][36864, 64, 1]cpu" = torch.ops.aten.bmm.default(view_55, view_56); view_55 = view_56 = None
view_57: "bf16[12, 9, 64, 1, 1, 64][36864, 4096, 64, 64, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_11, [12, 9, 64, 1, 1, 64]); bmm_11 = None
permute_25: "bf16[1, 12, 9, 64, 64, 1][64, 36864, 4096, 64, 1, 64]cpu" = torch.ops.aten.permute.default(view_57, [4, 0, 1, 2, 5, 3]); view_57 = None
view_58: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(permute_25, [1, 12, 9, 64, 64]); permute_25 = None
add_10: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(view_54, view_58); view_54 = view_58 = None
view_59: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.reshape.default(add_10, [108, 64, 64]); add_10 = None
view_60: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(view_59, [1, 12, 9, 64, 64]); view_59 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:795 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -2], second_last_key_mat, ndim=4)
select_29: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_4, 2, -2)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
view_61: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(select_29, [12, 64, 64]); select_29 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:776 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -3],
select_20: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_5, 2, -3)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:777 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -2],
select_21: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_5, 2, -2); view_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:779 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key[:, :, -1],
select_23: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = torch.ops.aten.select.int(view_10, 2, -1); view_10 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:773 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_key_mat = torch.cat(
cat_8: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" = torch.ops.aten.cat.default([select_3, select_20, select_21, select_6, select_23], 2); select_3 = select_20 = select_21 = select_6 = select_23 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
view_62: "bf16[12, 448, 64][28672, 64, 1]cpu" = torch.ops.aten.reshape.default(cat_8, [-1, 448, 64]); cat_8 = None
permute_26: "bf16[12, 64, 448][28672, 1, 64]cpu" = torch.ops.aten.permute.default(view_62, [0, 2, 1]); view_62 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
bmm_12: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.ops.aten.bmm.default(view_61, permute_26); view_61 = permute_26 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
view_63: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.reshape.default(bmm_12, [1, 12, 64, 448]); bmm_12 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:811 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product = second_last_product * rsqrt_d
mul_15: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.mul.Tensor(view_63, 0.125); view_63 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:799 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, -3 * to_block_size :],
slice_132: "f32[1, 1, 1, 192][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 3, -192, 9223372036854775807); arg16_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:796 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_seq_pad = torch.cat(
cat_10: "f32[1, 1, 1, 448][448, 448, 448, 1]cpu" = torch.ops.aten.cat.default([slice_68, slice_132, full_default], 3); slice_68 = slice_132 = full_default = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:807 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask[:, :, -1],
select_30: "f32[1, 12, 64, 192][1622016, 135168, 192, 1]cpu" = torch.ops.aten.select.int(mul, 2, -1); mul = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:804 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_rand_pad = torch.cat(
cat_11: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.cat.default([full_default_1, select_30], 3); full_default_1 = select_30 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:812 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product += (1.0 - torch.minimum(second_last_seq_pad, second_last_rand_pad)) * attn_mask_penalty
minimum_1: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.minimum.default(cat_10, cat_11); cat_10 = cat_11 = None
sub_9: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, minimum_1); minimum_1 = None
mul_16: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.mul.Tensor(sub_9, -10000.0); sub_9 = None
add_11: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.add.Tensor(mul_15, mul_16); mul_15 = mul_16 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:813 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_attn_weights = nn.functional.softmax(
amax_3: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.amax.default(add_11, [-1], True)
sub_10: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.sub.Tensor(add_11, amax_3); add_11 = amax_3 = None
exp_3: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.exp.default(sub_10); sub_10 = None
sum_4: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp_3, [-1], True)
div_5: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.div.Tensor(exp_3, sum_4); exp_3 = sum_4 = None
convert_element_type_41: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.prims.convert_element_type.default(div_5, torch.bfloat16); div_5 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
view_64: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.ops.aten.reshape.default(convert_element_type_41, [-1, 64, 448]); convert_element_type_41 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:786 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -3],
select_25: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_6, 2, -3)
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:787 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -2],
select_26: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_6, 2, -2); view_6 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:789 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value[:, :, -1],
select_28: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = torch.ops.aten.select.int(view_14, 2, -1); view_14 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:783 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_value_mat = torch.cat(
cat_9: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" = torch.ops.aten.cat.default([select_8, select_25, select_26, select_11, select_28], 2); select_8 = select_25 = select_26 = select_11 = select_28 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
view_65: "bf16[12, 448, 64][28672, 64, 1]cpu" = torch.ops.aten.reshape.default(cat_9, [-1, 448, 64]); cat_9 = None
bmm_13: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_64, view_65); view_64 = view_65 = None
view_66: "bf16[1, 12, 64, 64][49152, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_13, [1, 12, 64, 64]); bmm_13 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:819 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_context_layer.unsqueeze_(2)
unsqueeze_20: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(view_66, 2); view_66 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:826 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -1], key_layer, ndim=4)
select_31: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_4, 2, -1); view_4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
view_67: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(select_31, [12, 64, 64]); select_31 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
bmm_14: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.ops.aten.bmm.default(view_67, permute_2); view_67 = permute_2 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
view_69: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.reshape.default(bmm_14, [1, 12, 64, 832]); bmm_14 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:827 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product = last_product * rsqrt_d
mul_17: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.mul.Tensor(view_69, 0.125); view_69 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:828 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product += (1.0 - to_mask) * attn_mask_penalty
add_12: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.add.Tensor(mul_17, mul_4); mul_17 = mul_4 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:829 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_attn_weights = nn.functional.softmax(last_product, dim=-1) # [bsz, n_heads, from_block_size, n]
amax_4: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.amax.default(add_12, [-1], True)
sub_12: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.sub.Tensor(add_12, amax_4); add_12 = amax_4 = None
exp_4: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.exp.default(sub_12); sub_12 = None
sum_5: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp_4, [-1], True)
div_6: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.div.Tensor(exp_4, sum_5); exp_4 = sum_5 = None
convert_element_type_48: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.prims.convert_element_type.default(div_6, torch.bfloat16); div_6 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
view_70: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.ops.aten.reshape.default(convert_element_type_48, [-1, 64, 832]); convert_element_type_48 = None
bmm_15: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_70, view_19); view_70 = view_19 = None
view_72: "bf16[1, 12, 64, 64][49152, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_15, [1, 12, 64, 64]); bmm_15 = None
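        # NOTE: like the first block, the last query block attends densely: its 64
        # queries score against all 832 keys. Judging by their shapes, `permute_2`
        # ((12, 64, 832)) and `view_19` ((12, 832, 64)), both defined above this
        # excerpt, are the full key layer transposed and the full value layer
        # flattened for bmm.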
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:833 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_context_layer.unsqueeze_(2)
unsqueeze_21: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(view_72, 2); view_72 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:836 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer = torch.cat(
cat_12: "bf16[1, 12, 13, 64, 64][638976, 53248, 4096, 64, 1]cpu" = torch.ops.aten.cat.default([unsqueeze_4, unsqueeze_5, view_60, unsqueeze_20, unsqueeze_21], 2); unsqueeze_4 = unsqueeze_5 = view_60 = unsqueeze_20 = unsqueeze_21 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:840 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer = context_layer.view((bsz, n_heads, from_seq_len, -1)) * from_mask
view_73: "bf16[1, 12, 832, 64][638976, 53248, 64, 1]cpu" = torch.ops.aten.reshape.default(cat_12, [1, 12, 832, -1]); cat_12 = None
mul_19: "f32[1, 12, 832, 64][638976, 53248, 64, 1]cpu" = torch.ops.aten.mul.Tensor(view_73, arg18_1); view_73 = arg18_1 = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:841 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer = torch.transpose(context_layer, 1, 2)
permute_28: "f32[1, 832, 12, 64][638976, 64, 53248, 1]cpu" = torch.ops.aten.permute.default(mul_19, [0, 2, 1, 3]); mul_19 = None
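        # NOTE: the five context pieces (first, second, 9 middle, second-last, last:
        # 1 + 1 + 9 + 1 + 1 = 13 blocks of 64 rows = 832) are concatenated, flattened
        # to (1, 12, 832, 64), multiplied by from_mask (bf16 * fp32 promotes the output
        # to fp32), and transposed to (batch, seq_len, heads, head_dim) for the caller.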
return (permute_28, unsqueeze)
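# Reading of the graph tail above: last_product is widened to fp32, scaled by
# 1/sqrt(64) = 0.125, biased with (1 - to_mask) * -10000, and softmaxed along
# the last dim (amax-subtract / exp / sum / div) before being cast back to
# bf16 and bmm'ed against the value blocks; the five context pieces are then
# concatenated, multiplied by from_mask, and transposed to [1, 832, 12, 64].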
V0627 17:31:09.541000 139845268738432 torch/_inductor/graph.py:1693] {"inductor_output_code": {"filename": "/tmp/torchinductor_leslie/6i/c6icdm2jkh5xkxrgpyz2vtbd5oehca45dznneh7n63f3sirkkptn.py"}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0, "has_payload": "47217ba55691917867319806954aafb8"}
# AOT ID: ['8_inference']
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
from torch._inductor.codegen.memory_planning import _align as align
from torch import device, empty_strided
from torch._inductor.async_compile import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
from torch._inductor.codegen.multi_kernel import MultiKernelCall
aten = torch.ops.aten
inductor_ops = torch.ops.inductor
_quantized = torch.ops._quantized
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
alloc_from_pool = torch.ops.inductor._alloc_from_pool
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
async_compile = AsyncCompile()
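# The kernels below are Inductor's generated CPU code for this subgraph: each
# async_compile.cpp_pybinding(...) call compiles one fused C++ kernel that uses
# OpenMP parallel loops over at::vec::Vectorized lanes, with bf16 operands
# widened to fp32 for the arithmetic.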
cpp_fused__softmax_add_mul_rsub_0 = async_compile.cpp_pybinding(['const bfloat16*', 'const float*', 'float*', 'float*', 'float*', 'bfloat16*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const bfloat16* in_ptr0,
const float* in_ptr1,
float* out_ptr0,
float* out_ptr1,
float* out_ptr2,
bfloat16* out_ptr3)
{
#pragma omp parallel num_threads(56)
{
int tid = omp_get_thread_num();
{
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(768L); x0+=static_cast<long>(1L))
{
{
float tmp_acc0 = -std::numeric_limits<float>::infinity();
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(-std::numeric_limits<float>::infinity());
for(long x1=static_cast<long>(0L); x1<static_cast<long>(832L); x1+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x1 + (832L*x0)), 16);
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(x1), 16);
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = static_cast<float>(0.125);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 * tmp3;
auto tmp5 = (tmp4);
auto tmp7 = static_cast<float>(1.0);
auto tmp8 = at::vec::Vectorized<float>(tmp7);
auto tmp9 = tmp8 - tmp6;
auto tmp10 = static_cast<float>(-10000.0);
auto tmp11 = at::vec::Vectorized<float>(tmp10);
auto tmp12 = tmp9 * tmp11;
auto tmp13 = tmp5 + tmp12;
tmp_acc0_vec = at::vec::maximum(tmp_acc0_vec, tmp13);
}
tmp_acc0 = max_propagate_nan(tmp_acc0, at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return at::vec::maximum(x, y); }, tmp_acc0_vec));
out_ptr0[static_cast<long>(x0)] = static_cast<float>(tmp_acc0);
}
{
float tmp_acc0 = 0;
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
for(long x1=static_cast<long>(0L); x1<static_cast<long>(832L); x1+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x1 + (832L*x0)), 16);
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(x1), 16);
auto tmp14 = out_ptr0[static_cast<long>(x0)];
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = static_cast<float>(0.125);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 * tmp3;
auto tmp5 = (tmp4);
auto tmp7 = static_cast<float>(1.0);
auto tmp8 = at::vec::Vectorized<float>(tmp7);
auto tmp9 = tmp8 - tmp6;
auto tmp10 = static_cast<float>(-10000.0);
auto tmp11 = at::vec::Vectorized<float>(tmp10);
auto tmp12 = tmp9 * tmp11;
auto tmp13 = tmp5 + tmp12;
auto tmp15 = at::vec::Vectorized<float>(tmp14);
auto tmp16 = tmp13 - tmp15;
auto tmp17 = tmp16.exp();
tmp17.store(out_ptr1 + static_cast<long>(x1 + (832L*x0)));
tmp_acc0_vec = tmp_acc0_vec + tmp17;
}
tmp_acc0 = tmp_acc0 + at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return x + y; }, tmp_acc0_vec);
out_ptr2[static_cast<long>(x0)] = static_cast<float>(tmp_acc0);
}
for(long x1=static_cast<long>(0L); x1<static_cast<long>(832L); x1+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr1 + static_cast<long>(x1 + (832L*x0)), 16);
auto tmp1 = out_ptr2[static_cast<long>(x0)];
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 / tmp2;
auto tmp4 = at::vec::convert<bfloat16>(tmp3);
tmp4.store(out_ptr3 + static_cast<long>(x1 + (832L*x0)), 16);
}
}
}
}
}
''')
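# cpp_fused__softmax_add_mul_rsub_0: row-wise masked softmax over 832 columns
# for 768 rows (12 heads x 64 queries). Each bf16 score is widened to fp32,
# scaled by 0.125, offset by (1 - mask) * -10000, then normalized in three
# passes (row max, exp-and-sum, divide) and stored back as bf16.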
cpp_fused__to_copy_cat_stack_1 = async_compile.cpp_pybinding(['const int32_t*', 'const int32_t*', 'const int32_t*', 'const int32_t*', 'const int32_t*', 'const int32_t*', 'const int32_t*', 'const int32_t*', 'const int32_t*', 'const int32_t*', 'const int32_t*', 'const int32_t*', 'const int32_t*', 'const bfloat16*', 'int32_t*', 'int32_t*', 'int32_t*', 'int32_t*', 'int32_t*', 'int32_t*', 'int32_t*', 'int32_t*', 'int32_t*', 'int32_t*', 'int32_t*', 'int32_t*', 'int64_t*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const int32_t* in_ptr0,
const int32_t* in_ptr1,
const int32_t* in_ptr2,
const int32_t* in_ptr3,
const int32_t* in_ptr4,
const int32_t* in_ptr5,
const int32_t* in_ptr6,
const int32_t* in_ptr7,
const int32_t* in_ptr8,
const int32_t* in_ptr9,
const int32_t* in_ptr10,
const int32_t* in_ptr11,
const int32_t* in_ptr12,
const bfloat16* in_ptr13,
int32_t* out_ptr0,
int32_t* out_ptr1,
int32_t* out_ptr2,
int32_t* out_ptr3,
int32_t* out_ptr4,
int32_t* out_ptr5,
int32_t* out_ptr6,
int32_t* out_ptr7,
int32_t* out_ptr8,
int32_t* out_ptr9,
int32_t* out_ptr10,
int32_t* out_ptr11,
int64_t* out_ptr12,
bfloat16* out_ptr13,
bfloat16* out_ptr14,
bfloat16* out_ptr15,
bfloat16* out_ptr16,
bfloat16* out_ptr17,
bfloat16* out_ptr18,
bfloat16* out_ptr19,
bfloat16* out_ptr20)
{
{
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr0 + static_cast<long>(x0), 16);
tmp0.store(out_ptr0 + static_cast<long>(x0), 16);
}
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L))
{
auto tmp0 = in_ptr0[static_cast<long>(x0)];
out_ptr0[static_cast<long>(x0)] = tmp0;
}
}
{
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr1 + static_cast<long>(x0), 16);
tmp0.store(out_ptr1 + static_cast<long>(x0), 16);
}
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L))
{
auto tmp0 = in_ptr1[static_cast<long>(x0)];
out_ptr1[static_cast<long>(x0)] = tmp0;
}
}
{
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr2 + static_cast<long>(x0), 16);
tmp0.store(out_ptr2 + static_cast<long>(x0), 16);
}
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L))
{
auto tmp0 = in_ptr2[static_cast<long>(x0)];
out_ptr2[static_cast<long>(x0)] = tmp0;
}
}
{
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr3 + static_cast<long>(x0), 16);
tmp0.store(out_ptr3 + static_cast<long>(x0), 16);
}
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L))
{
auto tmp0 = in_ptr3[static_cast<long>(x0)];
out_ptr3[static_cast<long>(x0)] = tmp0;
}
}
{
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr4 + static_cast<long>(x0), 16);
tmp0.store(out_ptr4 + static_cast<long>(x0), 16);
}
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L))
{
auto tmp0 = in_ptr4[static_cast<long>(x0)];
out_ptr4[static_cast<long>(x0)] = tmp0;
}
}
{
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr5 + static_cast<long>(x0), 16);
tmp0.store(out_ptr5 + static_cast<long>(x0), 16);
}
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L))
{
auto tmp0 = in_ptr5[static_cast<long>(x0)];
out_ptr5[static_cast<long>(x0)] = tmp0;
}
}
{
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr6 + static_cast<long>(x0), 16);
tmp0.store(out_ptr6 + static_cast<long>(x0), 16);
}
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L))
{
auto tmp0 = in_ptr6[static_cast<long>(x0)];
out_ptr6[static_cast<long>(x0)] = tmp0;
}
}
{
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr7 + static_cast<long>(x0), 16);
tmp0.store(out_ptr7 + static_cast<long>(x0), 16);
}
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L))
{
auto tmp0 = in_ptr7[static_cast<long>(x0)];
out_ptr7[static_cast<long>(x0)] = tmp0;
}
}
{
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr8 + static_cast<long>(x0), 16);
tmp0.store(out_ptr8 + static_cast<long>(x0), 16);
}
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L))
{
auto tmp0 = in_ptr8[static_cast<long>(x0)];
out_ptr8[static_cast<long>(x0)] = tmp0;
}
}
{
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr9 + static_cast<long>(x0), 16);
tmp0.store(out_ptr9 + static_cast<long>(x0), 16);
}
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L))
{
auto tmp0 = in_ptr9[static_cast<long>(x0)];
out_ptr9[static_cast<long>(x0)] = tmp0;
}
}
{
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr10 + static_cast<long>(x0), 16);
tmp0.store(out_ptr10 + static_cast<long>(x0), 16);
}
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L))
{
auto tmp0 = in_ptr10[static_cast<long>(x0)];
out_ptr10[static_cast<long>(x0)] = tmp0;
}
}
{
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr11 + static_cast<long>(x0), 16);
tmp0.store(out_ptr11 + static_cast<long>(x0), 16);
}
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(32L); x0<static_cast<long>(33L); x0+=static_cast<long>(1L))
{
auto tmp0 = in_ptr11[static_cast<long>(x0)];
out_ptr11[static_cast<long>(x0)] = tmp0;
}
}
{
for(long x0=static_cast<long>(0L); x0<static_cast<long>(384L); x0+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr12 + static_cast<long>(x0), 16);
auto tmp1 = at::vec::convert<int64_t,2,int32_t,1>(tmp0);
tmp1.store(out_ptr12 + static_cast<long>(x0), 16);
}
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(384L); x0<static_cast<long>(396L); x0+=static_cast<long>(1L))
{
auto tmp0 = in_ptr12[static_cast<long>(x0)];
auto tmp1 = c10::convert<int64_t>(tmp0);
out_ptr12[static_cast<long>(x0)] = tmp1;
}
}
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L))
{
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr13 + static_cast<long>(x2 + (64L*x0) + (768L*x1)), 32);
tmp0.store(out_ptr13 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32);
tmp0.store(out_ptr14 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32);
}
}
}
}
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L))
{
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr13 + static_cast<long>(49152L + x2 + (64L*x0) + (768L*x1)), 32);
tmp0.store(out_ptr15 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32);
}
}
}
}
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L))
{
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr13 + static_cast<long>(98304L + x2 + (64L*x0) + (768L*x1)), 32);
tmp0.store(out_ptr16 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32);
}
}
}
}
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L))
{
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr13 + static_cast<long>(589824L + x2 + (64L*x0) + (768L*x1)), 32);
tmp0.store(out_ptr17 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32);
tmp0.store(out_ptr18 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32);
}
}
}
}
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(192L); x1+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(1L))
{
auto tmp0 = out_ptr12[static_cast<long>((33L*(c10::div_floor_integer((x2 + (64L*x1) + (135168L*x0)), 135168L))) + (c10::div_floor_integer((x2 + (64L*x1)), 4096L)))];
auto tmp13 = out_ptr12[static_cast<long>((33L*(c10::div_floor_integer((122880L + x2 + (64L*x1) + (135168L*x0)), 135168L))) + (c10::div_floor_integer((122880L + x2 + (64L*x1)), 4096L)))];
auto tmp1 = 13L*(c10::div_floor_integer(((33L*(c10::div_floor_integer((x2 + (64L*x1) + (135168L*x0)), 135168L))) + (c10::div_floor_integer((x2 + (64L*x1)), 4096L))), 33L));
auto tmp2 = c10::convert<int64_t>(tmp1);
auto tmp3 = decltype(tmp0)(tmp0 + tmp2);
auto tmp4 = 156L;
auto tmp5 = c10::convert<int64_t>(tmp4);
auto tmp6 = decltype(tmp3)(tmp3 + tmp5);
auto tmp7 = tmp3 < 0;
auto tmp8 = tmp7 ? tmp6 : tmp3;
auto tmp9 = tmp8;
auto tmp10 = c10::convert<int64_t>(tmp9);
TORCH_CHECK((0 <= tmp10) & (tmp10 < 156L), "index out of bounds: 0 <= tmp10 < 156L");
auto tmp12 = in_ptr13[static_cast<long>(x2 + (64L*(static_cast<long>(c10::div_floor_integer(tmp8, 13L)) % static_cast<long>(12L))) + (768L*(static_cast<long>(x1) % static_cast<long>(64L))) + (49152L*(static_cast<long>(tmp8) % static_cast<long>(13L))))];
auto tmp14 = 13L*(c10::div_floor_integer(((33L*(c10::div_floor_integer((122880L + x2 + (64L*x1) + (135168L*x0)), 135168L))) + (c10::div_floor_integer((122880L + x2 + (64L*x1)), 4096L))), 33L));
auto tmp15 = c10::convert<int64_t>(tmp14);
auto tmp16 = decltype(tmp13)(tmp13 + tmp15);
auto tmp17 = decltype(tmp16)(tmp16 + tmp5);
auto tmp18 = tmp16 < 0;
auto tmp19 = tmp18 ? tmp17 : tmp16;
auto tmp20 = tmp19;
auto tmp21 = c10::convert<int64_t>(tmp20);
TORCH_CHECK((0 <= tmp21) & (tmp21 < 156L), "index out of bounds: 0 <= tmp21 < 156L");
auto tmp23 = in_ptr13[static_cast<long>(x2 + (64L*(static_cast<long>(c10::div_floor_integer(tmp19, 13L)) % static_cast<long>(12L))) + (768L*(static_cast<long>(x1) % static_cast<long>(64L))) + (49152L*(static_cast<long>(tmp19) % static_cast<long>(13L))))];
out_ptr19[static_cast<long>(x2 + (64L*x1) + (28672L*x0))] = tmp12;
out_ptr20[static_cast<long>(x2 + (64L*x1) + (28672L*x0))] = tmp23;
}
}
}
}
}
''')
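# cpp_fused__to_copy_cat_stack_1: stages indices and key/value tiles for the
# sparse attention. It copies twelve length-33 int32 index rows, widens the
# flattened 396-entry index buffer to int64 (out_ptr12), slices fixed 64x64
# blocks out of the packed bf16 layout into per-head slots (the 28672 output
# stride suggests subviews of a [12, 448, 64] concat buffer), and gathers the
# randomly selected blocks through the int64 indices under TORCH_CHECK bounds
# guards.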
cpp_fused__softmax_add_cat_minimum_mul_new_ones_rsub_2 = async_compile.cpp_pybinding(['const float*', 'const float*', 'const int64_t*', 'const bfloat16*', 'const float*', 'const float*', 'const bfloat16*', 'float*', 'float*', 'float*', 'float*', 'float*', 'float*', 'float*', 'float*', 'float*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const float* in_ptr0,
const float* in_ptr1,
const int64_t* in_ptr2,
const bfloat16* in_ptr3,
const float* in_ptr4,
const float* in_ptr5,
const bfloat16* in_ptr6,
float* out_ptr0,
float* out_ptr1,
float* out_ptr2,
float* out_ptr3,
float* out_ptr4,
float* out_ptr5,
float* out_ptr6,
float* out_ptr7,
float* out_ptr8,
bfloat16* out_ptr9,
bfloat16* out_ptr10,
bfloat16* out_ptr11,
bfloat16* out_ptr12,
bfloat16* out_ptr13,
bfloat16* out_ptr14,
bfloat16* out_ptr15,
bfloat16* out_ptr16,
bfloat16* out_ptr17)
{
{
for(long x0=static_cast<long>(0L); x0<static_cast<long>(192L); x0+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<long>(x0), 16);
tmp0.store(out_ptr0 + static_cast<long>(x0));
}
}
{
for(long x0=static_cast<long>(0L); x0<static_cast<long>(64L); x0+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<long>(768L + x0), 16);
tmp0.store(out_ptr1 + static_cast<long>(x0));
}
}
{
for(long x0=static_cast<long>(0L); x0<static_cast<long>(192L); x0+=static_cast<long>(16L))
{
auto tmp0 = static_cast<float>(1.0);
auto tmp1 = at::vec::Vectorized<float>(tmp0);
tmp1.store(out_ptr2 + static_cast<long>(x0));
}
}
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(768L); x0+=static_cast<long>(1L))
{
for(long x1=static_cast<long>(0L); x1<static_cast<long>(256L); x1+=static_cast<long>(16L))
{
auto tmp0 = static_cast<float>(1.0);
auto tmp1 = at::vec::Vectorized<float>(tmp0);
tmp1.store(out_ptr3 + static_cast<long>(x1 + (448L*x0)));
}
}
}
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L))
{
for(long x2=static_cast<long>(0L); x2<static_cast<long>(192L); x2+=static_cast<long>(16L))
{
auto tmp0 = in_ptr1[static_cast<long>(64L + x1)];
auto tmp1 =
[&]
{
__at_align__ std::array<int64_t, 16> tmpbuf;
#pragma GCC unroll 16
for (long x2_inner = 0; x2_inner < 16; x2_inner++)
{
tmpbuf[x2_inner] = in_ptr2[static_cast<long>(c10::div_floor_integer((x2 + x2_inner + (2112L*x0)), 64L))];
}
return at::vec::VectorizedN<int64_t,2>::loadu(tmpbuf.data(), 16);
}
()
;
auto tmp16 = in_ptr1[static_cast<long>(704L + x1)];
auto tmp17 =
[&]
{
__at_align__ std::array<int64_t, 16> tmpbuf;
#pragma GCC unroll 16
for (long x2_inner = 0; x2_inner < 16; x2_inner++)
{
tmpbuf[x2_inner] = in_ptr2[static_cast<long>(c10::div_floor_integer((1920L + x2 + x2_inner + (2112L*x0)), 64L))];
}
return at::vec::VectorizedN<int64_t,2>::loadu(tmpbuf.data(), 16);
}
()
;
auto tmp2 = 13L;
auto tmp3 = c10::convert<int64_t>(tmp2);
auto tmp4 = at::vec::VectorizedN<int64_t,2>(tmp3);
auto tmp5 = tmp1 + tmp4;
auto tmp6 = static_cast<int64_t>(0);
auto tmp7 = at::vec::VectorizedN<int64_t,2>(tmp6);
auto tmp8 = at::vec::VecMask<int64_t,2>(tmp1 < tmp7);
auto tmp9 = decltype(tmp5)::blendv(tmp1, tmp5, tmp8.template cast<int64_t,2>());
auto tmp10 =
[&]
{
__at_align__ std::array<int64_t, 16> tmpbuf;
tmp9.store(tmpbuf.data());
return tmpbuf;
}
()
;
auto tmp11 =
[&]
{
__at_align__ std::array<int64_t, 16> tmpbuf;
#pragma GCC unroll 16
for (long x2_inner = 0; x2_inner < 16; x2_inner++)
{
tmpbuf[x2_inner] = static_cast<long>(tmp10[x2_inner]);
}
return at::vec::VectorizedN<int64_t,2>::loadu(tmpbuf.data(), 16);
}
()
;
TORCH_CHECK((at::vec::VecMask<int64_t,2>((at::vec::VectorizedN<int64_t,2>(0) <= tmp11) & (tmp11 < at::vec::VectorizedN<int64_t,2>(13L)))).all_masked(), "index out of bounds: 0 <= tmp11 < 13L");
auto tmp13 =
[&]
{
__at_align__ std::array<float, 16> tmpbuf;
#pragma GCC unroll 16
for (long x2_inner = 0; x2_inner < 16; x2_inner++)
{
tmpbuf[x2_inner] = in_ptr1[static_cast<long>((64L*tmp10[x2_inner]) + (static_cast<long>((x2 + x2_inner)) % static_cast<long>(64L)))];
}
return at::vec::Vectorized<float>::loadu(tmpbuf.data(), 16);
}
()
;
auto tmp14 = at::vec::Vectorized<float>(tmp0);
auto tmp15 = tmp14 * tmp13;
auto tmp18 = tmp17 + tmp4;
auto tmp19 = at::vec::VecMask<int64_t,2>(tmp17 < tmp7);
auto tmp20 = decltype(tmp18)::blendv(tmp17, tmp18, tmp19.template cast<int64_t,2>());
auto tmp21 =
[&]
{
__at_align__ std::array<int64_t, 16> tmpbuf;
tmp20.store(tmpbuf.data());
return tmpbuf;
}
()
;
auto tmp22 =
[&]
{
__at_align__ std::array<int64_t, 16> tmpbuf;
#pragma GCC unroll 16
for (long x2_inner = 0; x2_inner < 16; x2_inner++)
{
tmpbuf[x2_inner] = static_cast<long>(tmp21[x2_inner]);
}
return at::vec::VectorizedN<int64_t,2>::loadu(tmpbuf.data(), 16);
}
()
;
TORCH_CHECK((at::vec::VecMask<int64_t,2>((at::vec::VectorizedN<int64_t,2>(0) <= tmp22) & (tmp22 < at::vec::VectorizedN<int64_t,2>(13L)))).all_masked(), "index out of bounds: 0 <= tmp22 < 13L");
auto tmp24 =
[&]
{
__at_align__ std::array<float, 16> tmpbuf;
#pragma GCC unroll 16
for (long x2_inner = 0; x2_inner < 16; x2_inner++)
{
tmpbuf[x2_inner] = in_ptr1[static_cast<long>((64L*tmp21[x2_inner]) + (static_cast<long>((x2 + x2_inner)) % static_cast<long>(64L)))];
}
return at::vec::Vectorized<float>::loadu(tmpbuf.data(), 16);
}
()
;
auto tmp25 = at::vec::Vectorized<float>(tmp16);
auto tmp26 = tmp25 * tmp24;
tmp15.store(out_ptr4 + static_cast<long>(x2 + (448L*x1) + (28672L*x0)));
tmp26.store(out_ptr5 + static_cast<long>(x2 + (448L*x1) + (28672L*x0)));
}
}
}
}
#pragma omp parallel num_threads(56)
{
int tid = omp_get_thread_num();
{
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(768L); x0+=static_cast<long>(1L))
{
{
float tmp_acc0 = -std::numeric_limits<float>::infinity();
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(-std::numeric_limits<float>::infinity());
for(long x1=static_cast<long>(0L); x1<static_cast<long>(448L); x1+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr3 + static_cast<long>(x1 + (448L*x0)), 16);
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr4 + static_cast<long>(x1), 16);
auto tmp7 = at::vec::Vectorized<float>::loadu(in_ptr5 + static_cast<long>(x1 + (448L*x0)), 16);
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = static_cast<float>(0.125);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 * tmp3;
auto tmp5 = (tmp4);
auto tmp8 = at::vec::minimum(tmp6, tmp7);
auto tmp9 = static_cast<float>(1.0);
auto tmp10 = at::vec::Vectorized<float>(tmp9);
auto tmp11 = tmp10 - tmp8;
auto tmp12 = static_cast<float>(-10000.0);
auto tmp13 = at::vec::Vectorized<float>(tmp12);
auto tmp14 = tmp11 * tmp13;
auto tmp15 = tmp5 + tmp14;
tmp_acc0_vec = at::vec::maximum(tmp_acc0_vec, tmp15);
}
tmp_acc0 = max_propagate_nan(tmp_acc0, at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return at::vec::maximum(x, y); }, tmp_acc0_vec));
out_ptr6[static_cast<long>(x0)] = static_cast<float>(tmp_acc0);
}
{
float tmp_acc0 = 0;
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
for(long x1=static_cast<long>(0L); x1<static_cast<long>(448L); x1+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr3 + static_cast<long>(x1 + (448L*x0)), 16);
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr4 + static_cast<long>(x1), 16);
auto tmp7 = at::vec::Vectorized<float>::loadu(in_ptr5 + static_cast<long>(x1 + (448L*x0)), 16);
auto tmp16 = out_ptr6[static_cast<long>(x0)];
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = static_cast<float>(0.125);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 * tmp3;
auto tmp5 = (tmp4);
auto tmp8 = at::vec::minimum(tmp6, tmp7);
auto tmp9 = static_cast<float>(1.0);
auto tmp10 = at::vec::Vectorized<float>(tmp9);
auto tmp11 = tmp10 - tmp8;
auto tmp12 = static_cast<float>(-10000.0);
auto tmp13 = at::vec::Vectorized<float>(tmp12);
auto tmp14 = tmp11 * tmp13;
auto tmp15 = tmp5 + tmp14;
auto tmp17 = at::vec::Vectorized<float>(tmp16);
auto tmp18 = tmp15 - tmp17;
auto tmp19 = tmp18.exp();
tmp19.store(out_ptr7 + static_cast<long>(x1 + (448L*x0)));
tmp_acc0_vec = tmp_acc0_vec + tmp19;
}
tmp_acc0 = tmp_acc0 + at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return x + y; }, tmp_acc0_vec);
out_ptr8[static_cast<long>(x0)] = static_cast<float>(tmp_acc0);
}
for(long x1=static_cast<long>(0L); x1<static_cast<long>(448L); x1+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr7 + static_cast<long>(x1 + (448L*x0)), 16);
auto tmp1 = out_ptr8[static_cast<long>(x0)];
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 / tmp2;
auto tmp4 = at::vec::convert<bfloat16>(tmp3);
tmp4.store(out_ptr9 + static_cast<long>(x1 + (448L*x0)), 16);
}
}
}
#pragma omp single
{
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L))
{
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr6 + static_cast<long>(x2 + (64L*x0) + (768L*x1)), 32);
tmp0.store(out_ptr10 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32);
tmp0.store(out_ptr11 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32);
}
}
}
}
}
#pragma omp single
{
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L))
{
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr6 + static_cast<long>(49152L + x2 + (64L*x0) + (768L*x1)), 32);
tmp0.store(out_ptr12 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32);
}
}
}
}
}
#pragma omp single
{
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L))
{
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr6 + static_cast<long>(98304L + x2 + (64L*x0) + (768L*x1)), 32);
tmp0.store(out_ptr13 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32);
}
}
}
}
}
#pragma omp single
{
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L))
{
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr6 + static_cast<long>(589824L + x2 + (64L*x0) + (768L*x1)), 32);
tmp0.store(out_ptr14 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32);
tmp0.store(out_ptr15 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32);
}
}
}
}
}
#pragma omp single
{
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(192L); x1+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(1L))
{
auto tmp0 = in_ptr2[static_cast<long>((33L*(c10::div_floor_integer((x2 + (64L*x1) + (135168L*x0)), 135168L))) + (c10::div_floor_integer((x2 + (64L*x1)), 4096L)))];
auto tmp13 = in_ptr2[static_cast<long>((33L*(c10::div_floor_integer((122880L + x2 + (64L*x1) + (135168L*x0)), 135168L))) + (c10::div_floor_integer((122880L + x2 + (64L*x1)), 4096L)))];
auto tmp1 = 13L*(c10::div_floor_integer(((33L*(c10::div_floor_integer((x2 + (64L*x1) + (135168L*x0)), 135168L))) + (c10::div_floor_integer((x2 + (64L*x1)), 4096L))), 33L));
auto tmp2 = c10::convert<int64_t>(tmp1);
auto tmp3 = decltype(tmp0)(tmp0 + tmp2);
auto tmp4 = 156L;
auto tmp5 = c10::convert<int64_t>(tmp4);
auto tmp6 = decltype(tmp3)(tmp3 + tmp5);
auto tmp7 = tmp3 < 0;
auto tmp8 = tmp7 ? tmp6 : tmp3;
auto tmp9 = tmp8;
auto tmp10 = c10::convert<int64_t>(tmp9);
TORCH_CHECK((0 <= tmp10) & (tmp10 < 156L), "index out of bounds: 0 <= tmp10 < 156L");
auto tmp12 = in_ptr6[static_cast<long>(x2 + (64L*(static_cast<long>(c10::div_floor_integer(tmp8, 13L)) % static_cast<long>(12L))) + (768L*(static_cast<long>(x1) % static_cast<long>(64L))) + (49152L*(static_cast<long>(tmp8) % static_cast<long>(13L))))];
auto tmp14 = 13L*(c10::div_floor_integer(((33L*(c10::div_floor_integer((122880L + x2 + (64L*x1) + (135168L*x0)), 135168L))) + (c10::div_floor_integer((122880L + x2 + (64L*x1)), 4096L))), 33L));
auto tmp15 = c10::convert<int64_t>(tmp14);
auto tmp16 = decltype(tmp13)(tmp13 + tmp15);
auto tmp17 = decltype(tmp16)(tmp16 + tmp5);
auto tmp18 = tmp16 < 0;
auto tmp19 = tmp18 ? tmp17 : tmp16;
auto tmp20 = tmp19;
auto tmp21 = c10::convert<int64_t>(tmp20);
TORCH_CHECK((0 <= tmp21) & (tmp21 < 156L), "index out of bounds: 0 <= tmp21 < 156L");
auto tmp23 = in_ptr6[static_cast<long>(x2 + (64L*(static_cast<long>(c10::div_floor_integer(tmp19, 13L)) % static_cast<long>(12L))) + (768L*(static_cast<long>(x1) % static_cast<long>(64L))) + (49152L*(static_cast<long>(tmp19) % static_cast<long>(13L))))];
out_ptr16[static_cast<long>(x2 + (64L*x1) + (28672L*x0))] = tmp12;
out_ptr17[static_cast<long>(x2 + (64L*x1) + (28672L*x0))] = tmp23;
}
}
}
}
}
}
}
''')
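# cpp_fused__softmax_add_cat_minimum_mul_new_ones_rsub_2: the second block-row
# attention. It assembles the band mask (copied to_mask slices, new_ones
# fills, and rand-mask products gathered through the int64 block indices),
# runs a 448-column masked softmax per row using min(from_mask, band_mask)
# with the usual (1 - m) * -10000 bias, and then stacks the key/value tiles
# (fixed slices plus bounds-checked random gathers) for the following bmm.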
cpp_fused_cat_clone_3 = async_compile.cpp_pybinding(['const bfloat16*', 'const bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const bfloat16* in_ptr0,
const bfloat16* in_ptr1,
bfloat16* out_ptr0,
bfloat16* out_ptr1,
bfloat16* out_ptr2,
bfloat16* out_ptr3)
{
#pragma omp parallel num_threads(56)
{
int tid = omp_get_thread_num();
{
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(9L); x1+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(1L))
{
for(long x3=static_cast<long>(0L); x3<static_cast<long>(64L); x3+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(49152L + x3 + (64L*x0) + (768L*x2) + (49152L*x1)), 32);
tmp0.store(out_ptr0 + static_cast<long>(x3 + (64L*x2) + (12288L*x1) + (110592L*x0)), 32);
}
}
}
}
}
{
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(9L); x1+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(1L))
{
for(long x3=static_cast<long>(0L); x3<static_cast<long>(64L); x3+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(98304L + x3 + (64L*x0) + (768L*x2) + (49152L*x1)), 32);
tmp0.store(out_ptr1 + static_cast<long>(x3 + (64L*x2) + (12288L*x1) + (110592L*x0)), 32);
}
}
}
}
}
{
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(9L); x1+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(1L))
{
for(long x3=static_cast<long>(0L); x3<static_cast<long>(64L); x3+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(147456L + x3 + (64L*x0) + (768L*x2) + (49152L*x1)), 32);
tmp0.store(out_ptr2 + static_cast<long>(x3 + (64L*x2) + (12288L*x1) + (110592L*x0)), 32);
}
}
}
}
}
{
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(576L); x1+=static_cast<long>(1L))
{
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr1 + static_cast<long>(98304L + x2 + (64L*x0) + (768L*x1)), 32);
tmp0.store(out_ptr3 + static_cast<long>(x2 + (64L*x1) + (36864L*x0)), 32);
}
}
}
}
}
}
''')
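# cpp_fused_cat_clone_3: builds the sliding-window operand for the banded
# matmul by copying three shifted views of the packed key tensor (element
# offsets 49152/98304/147456, i.e. consecutive 64-token blocks) into
# contiguous [12, 9, 64, 64] buffers, plus the middle value blocks into a
# [12, 576, 64] buffer.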
cpp_fused_clone_4 = async_compile.cpp_pybinding(['const int64_t*', 'const bfloat16*', 'const bfloat16*', 'bfloat16*', 'bfloat16*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const int64_t* in_ptr0,
const bfloat16* in_ptr1,
const bfloat16* in_ptr2,
bfloat16* out_ptr0,
bfloat16* out_ptr1)
{
#pragma omp parallel num_threads(56)
{
int tid = omp_get_thread_num();
{
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(9L); x1+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x2=static_cast<long>(0L); x2<static_cast<long>(192L); x2+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x3=static_cast<long>(0L); x3<static_cast<long>(64L); x3+=static_cast<long>(1L))
{
auto tmp0 = in_ptr0[static_cast<long>((33L*(c10::div_floor_integer((12288L + x3 + (64L*x2) + (12288L*x1) + (135168L*x0)), 135168L))) + (c10::div_floor_integer((12288L + x3 + (64L*x2) + (12288L*x1)), 4096L)))];
auto tmp1 = 13L*(c10::div_floor_integer(((33L*(c10::div_floor_integer((12288L + x3 + (64L*x2) + (12288L*x1) + (135168L*x0)), 135168L))) + (c10::div_floor_integer((12288L + x3 + (64L*x2) + (12288L*x1)), 4096L))), 33L));
auto tmp2 = c10::convert<int64_t>(tmp1);
auto tmp3 = decltype(tmp0)(tmp0 + tmp2);
auto tmp4 = 156L;
auto tmp5 = c10::convert<int64_t>(tmp4);
auto tmp6 = decltype(tmp3)(tmp3 + tmp5);
auto tmp7 = tmp3 < 0;
auto tmp8 = tmp7 ? tmp6 : tmp3;
auto tmp9 = tmp8;
auto tmp10 = c10::convert<int64_t>(tmp9);
TORCH_CHECK((0 <= tmp10) & (tmp10 < 156L), "index out of bounds: 0 <= tmp10 < 156L");
auto tmp12 = in_ptr1[static_cast<long>(x3 + (64L*(static_cast<long>(c10::div_floor_integer(tmp8, 13L)) % static_cast<long>(12L))) + (768L*(static_cast<long>(x2) % static_cast<long>(64L))) + (49152L*(static_cast<long>(tmp8) % static_cast<long>(13L))))];
auto tmp13 = in_ptr2[static_cast<long>(x3 + (64L*(static_cast<long>(c10::div_floor_integer(tmp8, 13L)) % static_cast<long>(12L))) + (768L*(static_cast<long>(x2) % static_cast<long>(64L))) + (49152L*(static_cast<long>(tmp8) % static_cast<long>(13L))))];
out_ptr0[static_cast<long>(x3 + (64L*x2) + (12288L*x1) + (110592L*x0))] = tmp12;
out_ptr1[static_cast<long>(x3 + (64L*x2) + (12288L*x1) + (110592L*x0))] = tmp13;
}
}
}
}
}
}
}
''')
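# cpp_fused_clone_4: gathers the random-attention key/value tiles. For each of
# the 12 heads and 9 middle block rows it resolves the (possibly negative)
# int64 block index, bounds-checks it against the 156 (= 12 x 13) packed
# blocks, and copies the selected 64x64 tiles from both source tensors.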
cpp_fused__softmax__to_copy_add_cat_mul_rsub_5 = async_compile.cpp_pybinding(['const bfloat16*', 'const float*', 'const bfloat16*', 'const float*', 'const bfloat16*', 'const float*', 'const int64_t*', 'const bfloat16*', 'const bfloat16*', 'const bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'float*', 'float*', 'float*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const bfloat16* in_ptr0,
const float* in_ptr1,
const bfloat16* in_ptr2,
const float* in_ptr3,
const bfloat16* in_ptr4,
const float* in_ptr5,
const int64_t* in_ptr6,
const bfloat16* in_ptr7,
const bfloat16* in_ptr8,
const bfloat16* in_ptr9,
bfloat16* out_ptr0,
bfloat16* out_ptr1,
bfloat16* out_ptr2,
bfloat16* out_ptr3,
float* out_ptr4,
float* out_ptr5,
float* out_ptr6,
bfloat16* out_ptr7,
bfloat16* out_ptr8,
bfloat16* out_ptr9,
bfloat16* out_ptr10)
{
#pragma omp parallel num_threads(56)
{
int tid = omp_get_thread_num();
{
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(6912L); x0+=static_cast<long>(1L))
{
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x1 + (64L*x0)), 16);
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(x1), 16);
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = static_cast<float>(0.125);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 * tmp3;
auto tmp5 = (tmp4);
auto tmp7 = static_cast<float>(1.0);
auto tmp8 = at::vec::Vectorized<float>(tmp7);
auto tmp9 = tmp8 - tmp6;
auto tmp10 = static_cast<float>(-10000.0);
auto tmp11 = at::vec::Vectorized<float>(tmp10);
auto tmp12 = tmp9 * tmp11;
auto tmp13 = tmp5 + tmp12;
auto tmp14 = at::vec::convert<bfloat16>(tmp13);
tmp14.store(out_ptr0 + static_cast<long>(x1 + (512L*x0)), 16);
}
}
}
{
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(576L); x1+=static_cast<long>(1L))
{
for(long x2=static_cast<long>(0L); x2<static_cast<long>(192L); x2+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr2 + static_cast<long>(x2 + (192L*x1) + (110592L*x0)), 16);
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr3 + static_cast<long>(x2 + (192L*x1)), 16);
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = static_cast<float>(0.125);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 * tmp3;
auto tmp5 = (tmp4);
auto tmp7 = static_cast<float>(1.0);
auto tmp8 = at::vec::Vectorized<float>(tmp7);
auto tmp9 = tmp8 - tmp6;
auto tmp10 = static_cast<float>(-10000.0);
auto tmp11 = at::vec::Vectorized<float>(tmp10);
auto tmp12 = tmp9 * tmp11;
auto tmp13 = tmp5 + tmp12;
auto tmp14 = at::vec::convert<bfloat16>(tmp13);
tmp14.store(out_ptr1 + static_cast<long>(x2 + (512L*x1) + (294912L*x0)), 16);
}
}
}
}
{
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(9L); x1+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(1L))
{
for(long x3=static_cast<long>(0L); x3<static_cast<long>(192L); x3+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr4 + static_cast<long>(x3 + (192L*x2) + (12288L*x1) + (110592L*x0)), 16);
auto tmp6 = in_ptr5[static_cast<long>(128L + x2 + (64L*x1))];
auto tmp7 =
[&]
{
__at_align__ std::array<int64_t, 16> tmpbuf;
#pragma GCC unroll 16
for (long x3_inner = 0; x3_inner < 16; x3_inner++)
{
tmpbuf[x3_inner] = in_ptr6[static_cast<long>(c10::div_floor_integer((192L + x3 + x3_inner + (192L*x1) + (2112L*x0)), 64L))];
}
return at::vec::VectorizedN<int64_t,2>::loadu(tmpbuf.data(), 16);
}
()
;
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = static_cast<float>(0.125);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 * tmp3;
auto tmp5 = (tmp4);
auto tmp8 = 13L;
auto tmp9 = c10::convert<int64_t>(tmp8);
auto tmp10 = at::vec::VectorizedN<int64_t,2>(tmp9);
auto tmp11 = tmp7 + tmp10;
auto tmp12 = static_cast<int64_t>(0);
auto tmp13 = at::vec::VectorizedN<int64_t,2>(tmp12);
auto tmp14 = at::vec::VecMask<int64_t,2>(tmp7 < tmp13);
auto tmp15 = decltype(tmp11)::blendv(tmp7, tmp11, tmp14.template cast<int64_t,2>());
auto tmp16 =
[&]
{
__at_align__ std::array<int64_t, 16> tmpbuf;
tmp15.store(tmpbuf.data());
return tmpbuf;
}
()
;
auto tmp17 =
[&]
{
__at_align__ std::array<int64_t, 16> tmpbuf;
#pragma GCC unroll 16
for (long x3_inner = 0; x3_inner < 16; x3_inner++)
{
tmpbuf[x3_inner] = static_cast<long>(tmp16[x3_inner]);
}
return at::vec::VectorizedN<int64_t,2>::loadu(tmpbuf.data(), 16);
}
()
;
TORCH_CHECK((at::vec::VecMask<int64_t,2>((at::vec::VectorizedN<int64_t,2>(0) <= tmp17) & (tmp17 < at::vec::VectorizedN<int64_t,2>(13L)))).all_masked(), "index out of bounds: 0 <= tmp17 < 13L");
auto tmp19 =
[&]
{
__at_align__ std::array<float, 16> tmpbuf;
#pragma GCC unroll 16
for (long x3_inner = 0; x3_inner < 16; x3_inner++)
{
tmpbuf[x3_inner] = in_ptr5[static_cast<long>((64L*tmp16[x3_inner]) + (static_cast<long>((x3 + x3_inner)) % static_cast<long>(64L)))];
}
return at::vec::Vectorized<float>::loadu(tmpbuf.data(), 16);
}
()
;
auto tmp20 = at::vec::Vectorized<float>(tmp6);
auto tmp21 = tmp20 * tmp19;
auto tmp22 = static_cast<float>(1.0);
auto tmp23 = at::vec::Vectorized<float>(tmp22);
auto tmp24 = tmp23 - tmp21;
auto tmp25 = static_cast<float>(-10000.0);
auto tmp26 = at::vec::Vectorized<float>(tmp25);
auto tmp27 = tmp24 * tmp26;
auto tmp28 = tmp5 + tmp27;
auto tmp29 = at::vec::convert<bfloat16>(tmp28);
tmp29.store(out_ptr2 + static_cast<long>(x3 + (512L*x2) + (32768L*x1) + (294912L*x0)), 16);
}
}
}
}
}
{
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(6912L); x0+=static_cast<long>(1L))
{
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr7 + static_cast<long>(x1 + (64L*x0)), 16);
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(768L + x1), 16);
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = static_cast<float>(0.125);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 * tmp3;
auto tmp5 = (tmp4);
auto tmp7 = static_cast<float>(1.0);
auto tmp8 = at::vec::Vectorized<float>(tmp7);
auto tmp9 = tmp8 - tmp6;
auto tmp10 = static_cast<float>(-10000.0);
auto tmp11 = at::vec::Vectorized<float>(tmp10);
auto tmp12 = tmp9 * tmp11;
auto tmp13 = tmp5 + tmp12;
auto tmp14 = at::vec::convert<bfloat16>(tmp13);
tmp14.store(out_ptr3 + static_cast<long>(x1 + (512L*x0)), 16);
}
}
}
{
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(6912L); x0+=static_cast<long>(1L))
{
{
float tmp_acc0 = -std::numeric_limits<float>::infinity();
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(-std::numeric_limits<float>::infinity());
for(long x1=static_cast<long>(0L); x1<static_cast<long>(512L); x1+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr8 + static_cast<long>(x1 + (512L*x0)), 16);
auto tmp1 = at::vec::convert<float>(tmp0);
tmp_acc0_vec = at::vec::maximum(tmp_acc0_vec, tmp1);
}
tmp_acc0 = max_propagate_nan(tmp_acc0, at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return at::vec::maximum(x, y); }, tmp_acc0_vec));
out_ptr4[static_cast<long>(x0)] = static_cast<float>(tmp_acc0);
}
{
float tmp_acc0 = 0;
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
for(long x1=static_cast<long>(0L); x1<static_cast<long>(512L); x1+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr8 + static_cast<long>(x1 + (512L*x0)), 16);
auto tmp2 = out_ptr4[static_cast<long>(x0)];
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 - tmp3;
auto tmp5 = tmp4.exp();
tmp5.store(out_ptr5 + static_cast<long>(x1 + (512L*x0)));
tmp_acc0_vec = tmp_acc0_vec + tmp5;
}
tmp_acc0 = tmp_acc0 + at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return x + y; }, tmp_acc0_vec);
out_ptr6[static_cast<long>(x0)] = static_cast<float>(tmp_acc0);
}
for(long x1=static_cast<long>(0L); x1<static_cast<long>(512L); x1+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr5 + static_cast<long>(x1 + (512L*x0)), 16);
auto tmp1 = out_ptr6[static_cast<long>(x0)];
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 / tmp2;
auto tmp4 = at::vec::convert<bfloat16>(tmp3);
tmp4.store(out_ptr7 + static_cast<long>(x1 + (512L*x0)), 16);
}
}
}
{
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(9L); x1+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(1L))
{
for(long x3=static_cast<long>(0L); x3<static_cast<long>(64L); x3+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr9 + static_cast<long>(49152L + x3 + (64L*x0) + (768L*x2) + (49152L*x1)), 32);
tmp0.store(out_ptr8 + static_cast<long>(x3 + (64L*x2) + (12288L*x1) + (110592L*x0)), 32);
}
}
}
}
}
{
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(9L); x1+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(1L))
{
for(long x3=static_cast<long>(0L); x3<static_cast<long>(64L); x3+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr9 + static_cast<long>(98304L + x3 + (64L*x0) + (768L*x2) + (49152L*x1)), 32);
tmp0.store(out_ptr9 + static_cast<long>(x3 + (64L*x2) + (12288L*x1) + (110592L*x0)), 32);
}
}
}
}
}
{
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(9L); x1+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(1L))
{
for(long x3=static_cast<long>(0L); x3<static_cast<long>(64L); x3+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr9 + static_cast<long>(147456L + x3 + (64L*x0) + (768L*x2) + (49152L*x1)), 32);
tmp0.store(out_ptr10 + static_cast<long>(x3 + (64L*x2) + (12288L*x1) + (110592L*x0)), 32);
}
}
}
}
}
}
}
''')
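# cpp_fused__softmax_add_cat_mul_rsub_5: the middle block rows. First-block,
# sliding-window, random-block, and last-block scores (each scaled by 0.125
# and biased with (1 - m) * -10000, the random slot using a gathered mask) are
# written into what appear to be adjacent 64/192/192/64-column slots of a
# 512-wide concat buffer; a plain softmax then normalizes all 512 columns of
# the 6912 (= 12 x 9 x 64) rows, and the shifted value tiles are staged last.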
cpp_fused_cat_6 = async_compile.cpp_pybinding(['const bfloat16*', 'bfloat16*', 'bfloat16*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const bfloat16* in_ptr0,
bfloat16* out_ptr0,
bfloat16* out_ptr1)
{
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L))
{
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(491520L + x2 + (64L*x0) + (768L*x1)), 32);
tmp0.store(out_ptr0 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32);
}
}
}
}
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L))
{
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(540672L + x2 + (64L*x0) + (768L*x1)), 32);
tmp0.store(out_ptr1 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32);
}
}
}
}
}
''')
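# cpp_fused_cat_6: slices two fixed key blocks (element offsets 491520 and
# 540672, i.e. packed blocks 10 and 11) into per-head [64, 64] tiles for the
# band of the trailing block row.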
cpp_fused__softmax_add_cat_minimum_mul_rsub_7 = async_compile.cpp_pybinding(['const float*', 'const bfloat16*', 'const float*', 'const float*', 'const bfloat16*', 'float*', 'float*', 'float*', 'float*', 'float*', 'float*', 'float*', 'bfloat16*', 'bfloat16*', 'bfloat16*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const float* in_ptr0,
const bfloat16* in_ptr1,
const float* in_ptr2,
const float* in_ptr3,
const bfloat16* in_ptr4,
float* out_ptr0,
float* out_ptr1,
float* out_ptr2,
float* out_ptr3,
float* out_ptr4,
float* out_ptr5,
float* out_ptr6,
bfloat16* out_ptr7,
bfloat16* out_ptr8,
bfloat16* out_ptr9)
{
{
for(long x0=static_cast<long>(0L); x0<static_cast<long>(64L); x0+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<long>(x0), 16);
tmp0.store(out_ptr0 + static_cast<long>(x0));
}
}
{
for(long x0=static_cast<long>(0L); x0<static_cast<long>(192L); x0+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<long>(640L + x0), 16);
tmp0.store(out_ptr1 + static_cast<long>(x0));
}
}
{
for(long x0=static_cast<long>(0L); x0<static_cast<long>(192L); x0+=static_cast<long>(16L))
{
auto tmp0 = static_cast<float>(1.0);
auto tmp1 = at::vec::Vectorized<float>(tmp0);
tmp1.store(out_ptr2 + static_cast<long>(x0));
}
}
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(768L); x0+=static_cast<long>(1L))
{
for(long x1=static_cast<long>(0L); x1<static_cast<long>(256L); x1+=static_cast<long>(16L))
{
auto tmp0 = static_cast<float>(1.0);
auto tmp1 = at::vec::Vectorized<float>(tmp0);
tmp1.store(out_ptr3 + static_cast<long>(x1 + (448L*x0)));
}
}
}
#pragma omp parallel num_threads(56)
{
int tid = omp_get_thread_num();
{
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(768L); x0+=static_cast<long>(1L))
{
{
float tmp_acc0 = -std::numeric_limits<float>::infinity();
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(-std::numeric_limits<float>::infinity());
for(long x1=static_cast<long>(0L); x1<static_cast<long>(448L); x1+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr1 + static_cast<long>(x1 + (448L*x0)), 16);
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr2 + static_cast<long>(x1), 16);
auto tmp7 = at::vec::Vectorized<float>::loadu(in_ptr3 + static_cast<long>(x1 + (448L*x0)), 16);
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = static_cast<float>(0.125);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 * tmp3;
auto tmp5 = (tmp4);
auto tmp8 = at::vec::minimum(tmp6, tmp7);
auto tmp9 = static_cast<float>(1.0);
auto tmp10 = at::vec::Vectorized<float>(tmp9);
auto tmp11 = tmp10 - tmp8;
auto tmp12 = static_cast<float>(-10000.0);
auto tmp13 = at::vec::Vectorized<float>(tmp12);
auto tmp14 = tmp11 * tmp13;
auto tmp15 = tmp5 + tmp14;
tmp_acc0_vec = at::vec::maximum(tmp_acc0_vec, tmp15);
}
tmp_acc0 = max_propagate_nan(tmp_acc0, at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return at::vec::maximum(x, y); }, tmp_acc0_vec));
out_ptr4[static_cast<long>(x0)] = static_cast<float>(tmp_acc0);
}
{
float tmp_acc0 = 0;
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
for(long x1=static_cast<long>(0L); x1<static_cast<long>(448L); x1+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr1 + static_cast<long>(x1 + (448L*x0)), 16);
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr2 + static_cast<long>(x1), 16);
auto tmp7 = at::vec::Vectorized<float>::loadu(in_ptr3 + static_cast<long>(x1 + (448L*x0)), 16);
auto tmp16 = out_ptr4[static_cast<long>(x0)];
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = static_cast<float>(0.125);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 * tmp3;
auto tmp5 = (tmp4);
auto tmp8 = at::vec::minimum(tmp6, tmp7);
auto tmp9 = static_cast<float>(1.0);
auto tmp10 = at::vec::Vectorized<float>(tmp9);
auto tmp11 = tmp10 - tmp8;
auto tmp12 = static_cast<float>(-10000.0);
auto tmp13 = at::vec::Vectorized<float>(tmp12);
auto tmp14 = tmp11 * tmp13;
auto tmp15 = tmp5 + tmp14;
auto tmp17 = at::vec::Vectorized<float>(tmp16);
auto tmp18 = tmp15 - tmp17;
auto tmp19 = tmp18.exp();
tmp19.store(out_ptr5 + static_cast<long>(x1 + (448L*x0)));
tmp_acc0_vec = tmp_acc0_vec + tmp19;
}
tmp_acc0 = tmp_acc0 + at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return x + y; }, tmp_acc0_vec);
out_ptr6[static_cast<long>(x0)] = static_cast<float>(tmp_acc0);
}
for(long x1=static_cast<long>(0L); x1<static_cast<long>(448L); x1+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr5 + static_cast<long>(x1 + (448L*x0)), 16);
auto tmp1 = out_ptr6[static_cast<long>(x0)];
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 / tmp2;
auto tmp4 = at::vec::convert<bfloat16>(tmp3);
tmp4.store(out_ptr7 + static_cast<long>(x1 + (448L*x0)), 16);
}
}
}
#pragma omp single
{
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L))
{
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr4 + static_cast<long>(491520L + x2 + (64L*x0) + (768L*x1)), 32);
tmp0.store(out_ptr8 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32);
}
}
}
}
}
#pragma omp single
{
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L))
{
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr4 + static_cast<long>(540672L + x2 + (64L*x0) + (768L*x1)), 32);
tmp0.store(out_ptr9 + static_cast<long>(x2 + (64L*x1) + (28672L*x0)), 32);
}
}
}
}
}
}
}
''')
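# cpp_fused__softmax_add_cat_minimum_mul_rsub_7: mirror of kernel 2 for the
# second-to-last block row: to_mask slices for the first and trailing blocks
# (columns 0..64 and 640..832), ones fills, a 448-column masked softmax with
# min(from_mask, band_mask), and copies of value blocks 10 and 11.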
cpp_fused__softmax_add_mul_rsub_8 = async_compile.cpp_pybinding(['const bfloat16*', 'const float*', 'float*', 'float*', 'float*', 'bfloat16*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const bfloat16* in_ptr0,
const float* in_ptr1,
float* out_ptr0,
float* out_ptr1,
float* out_ptr2,
bfloat16* out_ptr3)
{
#pragma omp parallel num_threads(56)
{
int tid = omp_get_thread_num();
{
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(768L); x0+=static_cast<long>(1L))
{
{
float tmp_acc0 = -std::numeric_limits<float>::infinity();
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(-std::numeric_limits<float>::infinity());
for(long x1=static_cast<long>(0L); x1<static_cast<long>(832L); x1+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x1 + (832L*x0)), 16);
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(x1), 16);
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = static_cast<float>(0.125);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 * tmp3;
auto tmp5 = (tmp4);
auto tmp7 = static_cast<float>(1.0);
auto tmp8 = at::vec::Vectorized<float>(tmp7);
auto tmp9 = tmp8 - tmp6;
auto tmp10 = static_cast<float>(-10000.0);
auto tmp11 = at::vec::Vectorized<float>(tmp10);
auto tmp12 = tmp9 * tmp11;
auto tmp13 = tmp5 + tmp12;
tmp_acc0_vec = at::vec::maximum(tmp_acc0_vec, tmp13);
}
tmp_acc0 = max_propagate_nan(tmp_acc0, at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return at::vec::maximum(x, y); }, tmp_acc0_vec));
out_ptr0[static_cast<long>(x0)] = static_cast<float>(tmp_acc0);
}
{
float tmp_acc0 = 0;
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
for(long x1=static_cast<long>(0L); x1<static_cast<long>(832L); x1+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x1 + (832L*x0)), 16);
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(x1), 16);
auto tmp14 = out_ptr0[static_cast<long>(x0)];
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = static_cast<float>(0.125);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 * tmp3;
auto tmp5 = (tmp4);
auto tmp7 = static_cast<float>(1.0);
auto tmp8 = at::vec::Vectorized<float>(tmp7);
auto tmp9 = tmp8 - tmp6;
auto tmp10 = static_cast<float>(-10000.0);
auto tmp11 = at::vec::Vectorized<float>(tmp10);
auto tmp12 = tmp9 * tmp11;
auto tmp13 = tmp5 + tmp12;
auto tmp15 = at::vec::Vectorized<float>(tmp14);
auto tmp16 = tmp13 - tmp15;
auto tmp17 = tmp16.exp();
tmp17.store(out_ptr1 + static_cast<long>(x1 + (832L*x0)));
tmp_acc0_vec = tmp_acc0_vec + tmp17;
}
tmp_acc0 = tmp_acc0 + at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return x + y; }, tmp_acc0_vec);
out_ptr2[static_cast<long>(x0)] = static_cast<float>(tmp_acc0);
}
for(long x1=static_cast<long>(0L); x1<static_cast<long>(832L); x1+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr1 + static_cast<long>(x1 + (832L*x0)), 16);
auto tmp1 = out_ptr2[static_cast<long>(x0)];
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 / tmp2;
auto tmp4 = at::vec::convert<bfloat16>(tmp3);
tmp4.store(out_ptr3 + static_cast<long>(x1 + (832L*x0)), 16);
}
}
}
}
}
''')
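# The kernel below stitches the per-band attention outputs (first block, second
# block, the banded middle blocks, second-to-last and last block) back into the
# full (1, 12, 832, 64) context layout via strided copies into one buffer, then
# scales each row by from_mask (in_ptr9). Roughly equivalent eager code, shown
# only as an illustration: torch.cat(blocks, dim=2).view(1, 12, 832, 64) * from_mask.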
cpp_fused_cat_mul_9 = async_compile.cpp_pybinding(['const bfloat16*', 'const bfloat16*', 'const bfloat16*', 'const bfloat16*', 'const bfloat16*', 'const bfloat16*', 'const bfloat16*', 'const bfloat16*', 'const bfloat16*', 'const float*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'bfloat16*', 'float*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const bfloat16* in_ptr0,
const bfloat16* in_ptr1,
const bfloat16* in_ptr2,
const bfloat16* in_ptr3,
const bfloat16* in_ptr4,
const bfloat16* in_ptr5,
const bfloat16* in_ptr6,
const bfloat16* in_ptr7,
const bfloat16* in_ptr8,
const float* in_ptr9,
bfloat16* out_ptr0,
bfloat16* out_ptr1,
bfloat16* out_ptr2,
bfloat16* out_ptr3,
bfloat16* out_ptr4,
float* out_ptr5)
{
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
for(long x1=static_cast<long>(0L); x1<static_cast<long>(4096L); x1+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x1 + (4096L*x0)), 32);
tmp0.store(out_ptr0 + static_cast<long>(x1 + (53248L*x0)), 32);
}
}
}
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
for(long x1=static_cast<long>(0L); x1<static_cast<long>(4096L); x1+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr1 + static_cast<long>(x1 + (4096L*x0)), 32);
tmp0.store(out_ptr1 + static_cast<long>(x1 + (53248L*x0)), 32);
}
}
}
#pragma omp parallel num_threads(56)
{
int tid = omp_get_thread_num();
{
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
for(long x1=static_cast<long>(0L); x1<static_cast<long>(36864L); x1+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr2 + static_cast<long>(x1 + (36864L*x0)), 16);
auto tmp2 = at::vec::Vectorized<bfloat16>::loadu(in_ptr3 + static_cast<long>(x1 + (36864L*x0)), 16);
auto tmp5 = at::vec::Vectorized<bfloat16>::loadu(in_ptr4 + static_cast<long>(x1 + (36864L*x0)), 16);
auto tmp8 = at::vec::Vectorized<bfloat16>::loadu(in_ptr5 + static_cast<long>(x1 + (36864L*x0)), 16);
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp3 = at::vec::convert<float>(tmp2);
auto tmp4 = tmp1 + tmp3;
auto tmp6 = at::vec::convert<float>(tmp5);
auto tmp7 = tmp4 + tmp6;
auto tmp9 = at::vec::convert<float>(tmp8);
auto tmp10 = tmp7 + tmp9;
auto tmp11 = at::vec::convert<bfloat16>(tmp10);
tmp11.store(out_ptr2 + static_cast<long>(x1 + (53248L*x0)), 16);
}
}
}
#pragma omp single
{
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
for(long x1=static_cast<long>(0L); x1<static_cast<long>(4096L); x1+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr6 + static_cast<long>(x1 + (4096L*x0)), 32);
tmp0.store(out_ptr3 + static_cast<long>(x1 + (53248L*x0)), 32);
}
}
}
}
#pragma omp single
{
{
#pragma GCC ivdep
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
for(long x1=static_cast<long>(0L); x1<static_cast<long>(4096L); x1+=static_cast<long>(32L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr7 + static_cast<long>(x1 + (4096L*x0)), 32);
tmp0.store(out_ptr4 + static_cast<long>(x1 + (53248L*x0)), 32);
}
}
}
}
{
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(12L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(832L); x1+=static_cast<long>(1L))
{
for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr8 + static_cast<long>(x2 + (64L*x1) + (53248L*x0)), 16);
auto tmp2 = in_ptr9[static_cast<long>(x1)];
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 * tmp3;
tmp4.store(out_ptr5 + static_cast<long>(x2 + (64L*x1) + (53248L*x0)));
}
}
}
}
}
}
''')
async_compile.wait(globals())
del async_compile
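# call() is the generated wrapper: it takes the flattened input list, checks
# each tensor's size/stride contract with assert_size_stride (matching what was
# recorded at compile time), then interleaves extern bmm kernels with the fused
# C++ kernels above. Lines marked "# alias" are zero-copy views created by
# reinterpret_tensor at element offsets into a larger buffer (used to write
# torch.cat results in place), and the del statements drop references as early
# as possible so storage can be recycled (lines marked "# reuse").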
def call(args):
    arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, arg12_1, arg13_1, arg14_1, arg15_1, arg16_1, arg17_1, arg18_1 = args
    args.clear()
    assert_size_stride(arg0_1, (11, 3), (3, 1))
    assert_size_stride(arg1_1, (11, 3), (3, 1))
    assert_size_stride(arg2_1, (11, 3), (3, 1))
    assert_size_stride(arg3_1, (11, 3), (3, 1))
    assert_size_stride(arg4_1, (11, 3), (3, 1))
    assert_size_stride(arg5_1, (11, 3), (3, 1))
    assert_size_stride(arg6_1, (11, 3), (3, 1))
    assert_size_stride(arg7_1, (11, 3), (3, 1))
    assert_size_stride(arg8_1, (11, 3), (3, 1))
    assert_size_stride(arg9_1, (11, 3), (3, 1))
    assert_size_stride(arg10_1, (11, 3), (3, 1))
    assert_size_stride(arg11_1, (11, 3), (3, 1))
    assert_size_stride(arg12_1, (1, 12, 832, 64), (638976, 64, 768, 1))
    assert_size_stride(arg13_1, (1, 13, 64), (832, 64, 1))
    assert_size_stride(arg14_1, (1, 12, 832, 64), (638976, 64, 768, 1))
    assert_size_stride(arg15_1, (1, 12, 832, 64), (638976, 64, 768, 1))
    assert_size_stride(arg16_1, (1, 1, 1, 832), (832, 832, 832, 1))
    assert_size_stride(arg17_1, (1, 1, 9, 64, 192), (110592, 110592, 12288, 192, 1))
    assert_size_stride(arg18_1, (1, 1, 832, 1), (832, 832, 1, 1))
    buf0 = empty_strided_cpu((12, 64, 832), (53248, 832, 1), torch.bfloat16)
    # Source Nodes: [bmm], Original ATen: [aten.bmm]
    extern_kernels.bmm(reinterpret_tensor(arg12_1, (12, 64, 64), (64, 768, 1), 0), reinterpret_tensor(arg14_1, (12, 64, 832), (64, 1, 768), 0), out=buf0)
    buf1 = empty_strided_cpu((1, 12, 64, 1), (768, 64, 1, 768), torch.float32)
    buf2 = empty_strided_cpu((1, 12, 64, 832), (638976, 53248, 832, 1), torch.float32)
    buf3 = empty_strided_cpu((1, 12, 64, 1), (768, 64, 1, 768), torch.float32)
    buf4 = empty_strided_cpu((1, 12, 64, 832), (638976, 53248, 832, 1), torch.bfloat16)
    cpp_fused__softmax_add_mul_rsub_0(buf0, arg16_1, buf1, buf2, buf3, buf4)
    buf5 = empty_strided_cpu((12, 64, 64), (4096, 64, 1), torch.bfloat16)
    # Source Nodes: [bmm_1], Original ATen: [aten.bmm]
    extern_kernels.bmm(reinterpret_tensor(buf4, (12, 64, 832), (53248, 832, 1), 0), reinterpret_tensor(arg15_1, (12, 832, 64), (64, 768, 1), 0), out=buf5)
    buf18 = empty_strided_cpu((132, 3), (3, 1), torch.int32)
    buf6 = reinterpret_tensor(buf18, (11, 3), (3, 1), 0)  # alias
    buf7 = reinterpret_tensor(buf18, (11, 3), (3, 1), 33)  # alias
    buf8 = reinterpret_tensor(buf18, (11, 3), (3, 1), 66)  # alias
    buf9 = reinterpret_tensor(buf18, (11, 3), (3, 1), 99)  # alias
    buf10 = reinterpret_tensor(buf18, (11, 3), (3, 1), 132)  # alias
    buf11 = reinterpret_tensor(buf18, (11, 3), (3, 1), 165)  # alias
    buf12 = reinterpret_tensor(buf18, (11, 3), (3, 1), 198)  # alias
    buf13 = reinterpret_tensor(buf18, (11, 3), (3, 1), 231)  # alias
    buf14 = reinterpret_tensor(buf18, (11, 3), (3, 1), 264)  # alias
    buf15 = reinterpret_tensor(buf18, (11, 3), (3, 1), 297)  # alias
    buf16 = reinterpret_tensor(buf18, (11, 3), (3, 1), 330)  # alias
    buf17 = reinterpret_tensor(buf18, (11, 3), (3, 1), 363)  # alias
    buf19 = empty_strided_cpu((12, 11, 3), (33, 3, 1), torch.int64)
    buf25 = empty_strided_cpu((1, 12, 448, 64), (344064, 28672, 64, 1), torch.bfloat16)
    buf20 = reinterpret_tensor(buf25, (1, 12, 64, 64), (344064, 28672, 64, 1), 0)  # alias
    buf78 = empty_strided_cpu((1, 12, 448, 64), (344064, 28672, 64, 1), torch.bfloat16)
    buf73 = reinterpret_tensor(buf78, (1, 12, 64, 64), (344064, 28672, 64, 1), 0)  # alias
    buf21 = reinterpret_tensor(buf25, (1, 12, 64, 64), (344064, 28672, 64, 1), 4096)  # alias
    buf22 = reinterpret_tensor(buf25, (1, 12, 64, 64), (344064, 28672, 64, 1), 8192)  # alias
    buf23 = reinterpret_tensor(buf25, (1, 12, 64, 64), (344064, 28672, 64, 1), 12288)  # alias
    buf76 = reinterpret_tensor(buf78, (1, 12, 64, 64), (344064, 28672, 64, 1), 12288)  # alias
    buf24 = reinterpret_tensor(buf25, (1, 12, 192, 64), (344064, 28672, 64, 1), 16384)  # alias
    buf77 = reinterpret_tensor(buf78, (1, 12, 192, 64), (344064, 28672, 64, 1), 16384)  # alias
    cpp_fused__to_copy_cat_stack_1(arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, buf18, arg14_1, buf6, buf7, buf8, buf9, buf10, buf11, buf12, buf13, buf14, buf15, buf16, buf17, buf19, buf20, buf73, buf21, buf22, buf23, buf76, buf24, buf77)
    del arg0_1
    del arg10_1
    del arg11_1
    del arg1_1
    del arg2_1
    del arg3_1
    del arg4_1
    del arg5_1
    del arg6_1
    del arg7_1
    del arg8_1
    del arg9_1
    del buf10
    del buf11
    del buf12
    del buf13
    del buf14
    del buf15
    del buf16
    del buf17
    del buf18
    del buf20
    del buf21
    del buf22
    del buf23
    del buf24
    del buf6
    del buf7
    del buf8
    del buf9
    buf26 = empty_strided_cpu((12, 64, 448), (28672, 448, 1), torch.bfloat16)
    # Source Nodes: [bmm_2], Original ATen: [aten.bmm]
    extern_kernels.bmm(reinterpret_tensor(arg12_1, (12, 64, 64), (64, 768, 1), 49152), reinterpret_tensor(buf25, (12, 64, 448), (28672, 1, 64), 0), out=buf26)
    buf30 = empty_strided_cpu((1, 1, 1, 448), (448, 448, 448, 1), torch.float32)
    buf27 = reinterpret_tensor(buf30, (1, 1, 1, 192), (448, 448, 448, 1), 0)  # alias
    buf28 = reinterpret_tensor(buf30, (1, 1, 1, 64), (448, 448, 448, 1), 192)  # alias
    buf29 = reinterpret_tensor(buf30, (1, 1, 1, 192), (448, 448, 448, 1), 256)  # alias
    buf33 = empty_strided_cpu((1, 12, 64, 448), (344064, 28672, 448, 1), torch.float32)
    buf31 = reinterpret_tensor(buf33, (1, 12, 64, 256), (344064, 28672, 448, 1), 0)  # alias
    buf32 = reinterpret_tensor(buf33, (1, 12, 64, 192), (344064, 28672, 448, 1), 256)  # alias
    buf86 = empty_strided_cpu((1, 12, 64, 448), (344064, 28672, 448, 1), torch.float32)
    buf85 = reinterpret_tensor(buf86, (1, 12, 64, 192), (344064, 28672, 448, 1), 256)  # alias
    buf34 = buf3; del buf3  # reuse
    buf35 = empty_strided_cpu((1, 12, 64, 448), (344064, 28672, 448, 1), torch.float32)
    buf36 = buf1; del buf1  # reuse
    buf43 = reinterpret_tensor(buf25, (1, 12, 64, 448), (344064, 28672, 448, 1), 0); del buf25  # reuse
    buf42 = empty_strided_cpu((1, 12, 448, 64), (344064, 28672, 64, 1), torch.bfloat16)
    buf37 = reinterpret_tensor(buf42, (1, 12, 64, 64), (344064, 28672, 64, 1), 0)  # alias
    buf95 = empty_strided_cpu((1, 12, 448, 64), (344064, 28672, 64, 1), torch.bfloat16)
    buf90 = reinterpret_tensor(buf95, (1, 12, 64, 64), (344064, 28672, 64, 1), 0)  # alias
    buf38 = reinterpret_tensor(buf42, (1, 12, 64, 64), (344064, 28672, 64, 1), 4096)  # alias
    buf39 = reinterpret_tensor(buf42, (1, 12, 64, 64), (344064, 28672, 64, 1), 8192)  # alias
    buf40 = reinterpret_tensor(buf42, (1, 12, 64, 64), (344064, 28672, 64, 1), 12288)  # alias
    buf93 = reinterpret_tensor(buf95, (1, 12, 64, 64), (344064, 28672, 64, 1), 12288)  # alias
    buf41 = reinterpret_tensor(buf42, (1, 12, 192, 64), (344064, 28672, 64, 1), 16384)  # alias
    buf94 = reinterpret_tensor(buf95, (1, 12, 192, 64), (344064, 28672, 64, 1), 16384)  # alias
    cpp_fused__softmax_add_cat_minimum_mul_new_ones_rsub_2(arg16_1, arg13_1, buf19, buf26, buf30, buf33, arg15_1, buf27, buf28, buf29, buf31, buf32, buf85, buf34, buf35, buf36, buf43, buf37, buf90, buf38, buf39, buf40, buf93, buf41, buf94)
    del buf26
    del buf27
    del buf28
    del buf29
    del buf31
    del buf32
    del buf33
    del buf37
    del buf38
    del buf39
    del buf40
    del buf41
    buf44 = empty_strided_cpu((12, 64, 64), (4096, 64, 1), torch.bfloat16)
    # Source Nodes: [bmm_3], Original ATen: [aten.bmm]
    extern_kernels.bmm(reinterpret_tensor(buf43, (12, 64, 448), (28672, 448, 1), 0), reinterpret_tensor(buf42, (12, 448, 64), (28672, 64, 1), 0), out=buf44)
    del buf42
    buf45 = empty_strided_cpu((12, 576, 64), (36864, 64, 1), torch.bfloat16)
    # Source Nodes: [first_band_product], Original ATen: [aten.bmm]
    extern_kernels.bmm(reinterpret_tensor(arg12_1, (12, 576, 64), (64, 768, 1), 98304), reinterpret_tensor(arg14_1, (12, 64, 64), (64, 1, 768), 0), out=buf45)
    buf49 = empty_strided_cpu((1, 12, 9, 192, 64), (1327104, 110592, 12288, 64, 1), torch.bfloat16)
    buf46 = reinterpret_tensor(buf49, (1, 12, 9, 64, 64), (1327104, 110592, 12288, 64, 1), 0)  # alias
    buf47 = reinterpret_tensor(buf49, (1, 12, 9, 64, 64), (1327104, 110592, 12288, 64, 1), 4096)  # alias
    buf48 = reinterpret_tensor(buf49, (1, 12, 9, 64, 64), (1327104, 110592, 12288, 64, 1), 8192)  # alias
    buf50 = empty_strided_cpu((1, 12, 9, 64, 64), (442368, 36864, 4096, 64, 1), torch.bfloat16)
    cpp_fused_cat_clone_3(arg14_1, arg12_1, buf46, buf47, buf48, buf50)
    del buf46
    del buf47
    del buf48
    buf51 = empty_strided_cpu((108, 64, 192), (12288, 192, 1), torch.bfloat16)
    # Source Nodes: [bmm_4], Original ATen: [aten.bmm]
    extern_kernels.bmm(reinterpret_tensor(buf50, (108, 64, 64), (4096, 64, 1), 0), reinterpret_tensor(buf49, (108, 64, 192), (12288, 1, 64), 0), out=buf51)
    buf52 = buf49; del buf49  # reuse
    buf69 = empty_strided_cpu((1, 12, 9, 192, 64), (1327104, 110592, 12288, 64, 1), torch.bfloat16)
    cpp_fused_clone_4(buf19, arg14_1, arg15_1, buf52, buf69)
    buf53 = empty_strided_cpu((108, 64, 192), (12288, 192, 1), torch.bfloat16)
    # Source Nodes: [bmm_5], Original ATen: [aten.bmm]
    extern_kernels.bmm(reinterpret_tensor(buf50, (108, 64, 64), (4096, 64, 1), 0), reinterpret_tensor(buf52, (108, 64, 192), (12288, 1, 64), 0), out=buf53)
    buf54 = reinterpret_tensor(buf50, (12, 576, 64), (36864, 64, 1), 0); del buf50  # reuse
    # Source Nodes: [last_band_product], Original ATen: [aten.bmm]
    extern_kernels.bmm(reinterpret_tensor(arg12_1, (12, 576, 64), (64, 768, 1), 98304), reinterpret_tensor(arg14_1, (12, 64, 64), (64, 1, 768), 589824), out=buf54)
    buf59 = empty_strided_cpu((1, 12, 9, 64, 512), (3538944, 294912, 32768, 512, 1), torch.bfloat16)
    buf55 = reinterpret_tensor(buf59, (1, 12, 9, 64, 64), (3538944, 294912, 32768, 512, 1), 0)  # alias
    buf56 = reinterpret_tensor(buf59, (1, 12, 9, 64, 192), (3538944, 294912, 32768, 512, 1), 64)  # alias
    buf57 = reinterpret_tensor(buf59, (1, 12, 9, 64, 192), (3538944, 294912, 32768, 512, 1), 256)  # alias
    buf58 = reinterpret_tensor(buf59, (1, 12, 9, 64, 64), (3538944, 294912, 32768, 512, 1), 448)  # alias
    buf60 = empty_strided_cpu((1, 12, 9, 64, 1), (6912, 576, 64, 1, 6912), torch.float32)
    buf61 = empty_strided_cpu((1, 12, 9, 64, 512), (3538944, 294912, 32768, 512, 1), torch.float32)
    buf62 = empty_strided_cpu((1, 12, 9, 64, 1), (6912, 576, 64, 1, 6912), torch.float32)
    buf67 = empty_strided_cpu((1, 12, 9, 64, 512), (3538944, 294912, 32768, 512, 1), torch.bfloat16)
    buf66 = buf52; del buf52  # reuse
    buf63 = reinterpret_tensor(buf66, (1, 12, 9, 64, 64), (1327104, 110592, 12288, 64, 1), 0)  # alias
    buf64 = reinterpret_tensor(buf66, (1, 12, 9, 64, 64), (1327104, 110592, 12288, 64, 1), 4096)  # alias
    buf65 = reinterpret_tensor(buf66, (1, 12, 9, 64, 64), (1327104, 110592, 12288, 64, 1), 8192)  # alias
    cpp_fused__softmax__to_copy_add_cat_mul_rsub_5(buf45, arg16_1, buf51, arg17_1, buf53, arg13_1, buf19, buf54, buf59, arg15_1, buf55, buf56, buf57, buf58, buf60, buf61, buf62, buf67, buf63, buf64, buf65)
    del arg13_1
    del arg17_1
    del buf51
    del buf53
    del buf55
    del buf56
    del buf57
    del buf58
    del buf59
    del buf60
    del buf61
    del buf62
    del buf63
    del buf64
    del buf65
    buf68 = reinterpret_tensor(buf54, (108, 64, 64), (4096, 64, 1), 0); del buf54  # reuse
    # Source Nodes: [bmm_6], Original ATen: [aten.bmm]
    extern_kernels.bmm(reinterpret_tensor(buf67, (108, 64, 192), (32768, 512, 1), 64), reinterpret_tensor(buf66, (108, 192, 64), (12288, 64, 1), 0), out=buf68)
    del buf66
    buf70 = reinterpret_tensor(buf45, (108, 64, 64), (4096, 64, 1), 0); del buf45  # reuse
    # Source Nodes: [bmm_7], Original ATen: [aten.bmm]
    extern_kernels.bmm(reinterpret_tensor(buf67, (108, 64, 192), (32768, 512, 1), 256), reinterpret_tensor(buf69, (108, 192, 64), (12288, 64, 1), 0), out=buf70)
    del buf69
    buf71 = empty_strided_cpu((12, 576, 64), (36864, 64, 1), torch.bfloat16)
    # Source Nodes: [einsum_3], Original ATen: [aten.bmm]
    extern_kernels.bmm(reinterpret_tensor(buf67, (12, 576, 64), (294912, 512, 1), 0), reinterpret_tensor(arg15_1, (12, 64, 64), (64, 768, 1), 0), out=buf71)
    buf72 = empty_strided_cpu((12, 576, 64), (36864, 64, 1), torch.bfloat16)
    # Source Nodes: [einsum_4], Original ATen: [aten.bmm]
    extern_kernels.bmm(reinterpret_tensor(buf67, (12, 576, 64), (294912, 512, 1), 448), reinterpret_tensor(arg15_1, (12, 64, 64), (64, 768, 1), 589824), out=buf72)
    del buf67
    buf74 = reinterpret_tensor(buf78, (1, 12, 64, 64), (344064, 28672, 64, 1), 4096)  # alias
    buf75 = reinterpret_tensor(buf78, (1, 12, 64, 64), (344064, 28672, 64, 1), 8192)  # alias
    cpp_fused_cat_6(arg14_1, buf74, buf75)
    del buf73
    del buf74
    del buf75
    del buf76
    del buf77
    buf79 = reinterpret_tensor(buf43, (12, 64, 448), (28672, 448, 1), 0); del buf43  # reuse
    # Source Nodes: [bmm_8], Original ATen: [aten.bmm]
    extern_kernels.bmm(reinterpret_tensor(arg12_1, (12, 64, 64), (64, 768, 1), 540672), reinterpret_tensor(buf78, (12, 64, 448), (28672, 1, 64), 0), out=buf79)
    buf83 = buf30; del buf30  # reuse
    buf80 = reinterpret_tensor(buf83, (1, 1, 1, 64), (448, 448, 448, 1), 0)  # alias
    buf81 = reinterpret_tensor(buf83, (1, 1, 1, 192), (448, 448, 448, 1), 64)  # alias
    buf82 = reinterpret_tensor(buf83, (1, 1, 1, 192), (448, 448, 448, 1), 256)  # alias
    buf84 = reinterpret_tensor(buf86, (1, 12, 64, 256), (344064, 28672, 448, 1), 0)  # alias
    buf87 = buf36; del buf36  # reuse
    buf88 = buf35; del buf35  # reuse
    buf89 = buf34; del buf34  # reuse
    buf96 = reinterpret_tensor(buf78, (1, 12, 64, 448), (344064, 28672, 448, 1), 0); del buf78  # reuse
    buf91 = reinterpret_tensor(buf95, (1, 12, 64, 64), (344064, 28672, 64, 1), 4096)  # alias
    buf92 = reinterpret_tensor(buf95, (1, 12, 64, 64), (344064, 28672, 64, 1), 8192)  # alias
    cpp_fused__softmax_add_cat_minimum_mul_rsub_7(arg16_1, buf79, buf83, buf86, arg15_1, buf80, buf81, buf82, buf84, buf87, buf88, buf89, buf96, buf91, buf92)
    del buf79
    del buf80
    del buf81
    del buf82
    del buf83
    del buf84
    del buf85
    del buf86
    del buf88
    del buf90
    del buf91
    del buf92
    del buf93
    del buf94
    buf97 = empty_strided_cpu((12, 64, 64), (4096, 64, 1), torch.bfloat16)
    # Source Nodes: [bmm_9], Original ATen: [aten.bmm]
    extern_kernels.bmm(reinterpret_tensor(buf96, (12, 64, 448), (28672, 448, 1), 0), reinterpret_tensor(buf95, (12, 448, 64), (28672, 64, 1), 0), out=buf97)
    del buf95
    del buf96
    buf98 = reinterpret_tensor(buf4, (12, 64, 832), (53248, 832, 1), 0); del buf4  # reuse
    # Source Nodes: [bmm_10], Original ATen: [aten.bmm]
    extern_kernels.bmm(reinterpret_tensor(arg12_1, (12, 64, 64), (64, 768, 1), 589824), reinterpret_tensor(arg14_1, (12, 64, 832), (64, 1, 768), 0), out=buf98)
    del arg12_1
    del arg14_1
    buf99 = buf89; del buf89  # reuse
    buf100 = buf2; del buf2  # reuse
    buf101 = buf87; del buf87  # reuse
    buf102 = reinterpret_tensor(buf0, (1, 12, 64, 832), (638976, 53248, 832, 1), 0); del buf0  # reuse
    cpp_fused__softmax_add_mul_rsub_8(buf98, arg16_1, buf99, buf100, buf101, buf102)
    del arg16_1
    del buf101
    del buf98
    del buf99
    buf103 = empty_strided_cpu((12, 64, 64), (4096, 64, 1), torch.bfloat16)
    # Source Nodes: [bmm_11], Original ATen: [aten.bmm]
    extern_kernels.bmm(reinterpret_tensor(buf102, (12, 64, 832), (53248, 832, 1), 0), reinterpret_tensor(arg15_1, (12, 832, 64), (64, 768, 1), 0), out=buf103)
    del arg15_1
    buf109 = reinterpret_tensor(buf102, (1, 12, 13, 64, 64), (638976, 53248, 4096, 64, 1), 0); del buf102  # reuse
    buf104 = reinterpret_tensor(buf109, (1, 12, 1, 64, 64), (638976, 53248, 4096, 64, 1), 0)  # alias
    buf105 = reinterpret_tensor(buf109, (1, 12, 1, 64, 64), (638976, 53248, 4096, 64, 1), 4096)  # alias
    buf106 = reinterpret_tensor(buf109, (1, 12, 9, 64, 64), (638976, 53248, 4096, 64, 1), 8192)  # alias
    buf107 = reinterpret_tensor(buf109, (1, 12, 1, 64, 64), (638976, 53248, 4096, 64, 1), 45056)  # alias
    buf108 = reinterpret_tensor(buf109, (1, 12, 1, 64, 64), (638976, 53248, 4096, 64, 1), 49152)  # alias
    buf110 = reinterpret_tensor(buf100, (1, 12, 832, 64), (638976, 53248, 64, 1), 0); del buf100  # reuse
    cpp_fused_cat_mul_9(buf5, buf44, buf68, buf70, buf71, buf72, buf97, buf103, buf109, arg18_1, buf104, buf105, buf106, buf107, buf108, buf110)
    del arg18_1
    return (reinterpret_tensor(buf110, (1, 832, 12, 64), (638976, 64, 53248, 1), 0), reinterpret_tensor(buf19, (1, 12, 11, 3), (396, 33, 3, 1), 0), )
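# benchmark_compiled_module reconstructs inputs with the exact sizes, strides
# and dtypes the kernels were specialized for (rand_strided) and times call()
# via print_performance; running this generated file directly benchmarks the
# compiled module standalone.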
def benchmark_compiled_module(times=10, repeat=10):
    from torch._dynamo.testing import rand_strided
    from torch._inductor.utils import print_performance
    arg0_1 = rand_strided((11, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg1_1 = rand_strided((11, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg2_1 = rand_strided((11, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg3_1 = rand_strided((11, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg4_1 = rand_strided((11, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg5_1 = rand_strided((11, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg6_1 = rand_strided((11, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg7_1 = rand_strided((11, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg8_1 = rand_strided((11, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg9_1 = rand_strided((11, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg10_1 = rand_strided((11, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg11_1 = rand_strided((11, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg12_1 = rand_strided((1, 12, 832, 64), (638976, 64, 768, 1), device='cpu', dtype=torch.bfloat16)
    arg13_1 = rand_strided((1, 13, 64), (832, 64, 1), device='cpu', dtype=torch.float32)
    arg14_1 = rand_strided((1, 12, 832, 64), (638976, 64, 768, 1), device='cpu', dtype=torch.bfloat16)
    arg15_1 = rand_strided((1, 12, 832, 64), (638976, 64, 768, 1), device='cpu', dtype=torch.bfloat16)
    arg16_1 = rand_strided((1, 1, 1, 832), (832, 832, 832, 1), device='cpu', dtype=torch.float32)
    arg17_1 = rand_strided((1, 1, 9, 64, 192), (110592, 110592, 12288, 192, 1), device='cpu', dtype=torch.float32)
    arg18_1 = rand_strided((1, 1, 832, 1), (832, 832, 1, 1), device='cpu', dtype=torch.float32)
    fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, arg12_1, arg13_1, arg14_1, arg15_1, arg16_1, arg17_1, arg18_1])
    return print_performance(fn, times=times, repeat=repeat)
if __name__ == "__main__":
    from torch._inductor.wrapper_benchmark import compiled_module_main
    compiled_module_main('hf_BigBird', benchmark_compiled_module)
V0627 17:31:09.567000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:09.568000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0, "has_payload": "81a28a443bd0d99705f0b5d2b9a46edc"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['bsz'], accessed_by=DictGetItemGuardAccessor(bsz)
| | +- EQUALS_MATCH: L['bsz'] == 1
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202274384)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| +- GuardManager: source=L['n_heads'], accessed_by=DictGetItemGuardAccessor(n_heads)
| | +- EQUALS_MATCH: L['n_heads'] == 12
| +- GuardManager: source=L['rsqrt_d'], accessed_by=DictGetItemGuardAccessor(rsqrt_d)
| | +- EQUALS_MATCH: L['rsqrt_d'] == 0.125
| +- GuardManager: source=L['to_mask'], accessed_by=DictGetItemGuardAccessor(to_mask)
| | +- TENSOR_MATCH: check_tensor(L['to_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 1, 832], stride=[832, 832, 832, 1])
| | +- NO_HASATTR: hasattr(L['to_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| +- GuardManager: source=L['___stack0'], accessed_by=DictGetItemGuardAccessor(___stack0)
| | +- TYPE_MATCH: ___check_type_id(L['___stack0'], 7650400)
| | +- LENGTH_CHECK: len(L['___stack0']) == 12
| | +- GuardManager: source=L['___stack0'][0], accessed_by=ListGetItemGuardAccessor(0)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][0]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][0]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][0]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][1], accessed_by=ListGetItemGuardAccessor(1)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][1]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][1]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][1]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][2], accessed_by=ListGetItemGuardAccessor(2)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][2]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][2]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][2]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][3], accessed_by=ListGetItemGuardAccessor(3)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][3]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][3]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][3]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][4], accessed_by=ListGetItemGuardAccessor(4)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][4]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][4]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][4]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][5], accessed_by=ListGetItemGuardAccessor(5)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][5]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][5]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][5]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][6], accessed_by=ListGetItemGuardAccessor(6)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][6]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][6]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][6]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][7], accessed_by=ListGetItemGuardAccessor(7)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][7]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][7]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][7]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][8], accessed_by=ListGetItemGuardAccessor(8)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][8]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][8]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][8]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][9], accessed_by=ListGetItemGuardAccessor(9)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][9]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][9]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][9]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][10], accessed_by=ListGetItemGuardAccessor(10)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][10]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][10]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][10]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][11], accessed_by=ListGetItemGuardAccessor(11)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][11]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][11]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[11, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][11]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| +- GuardManager: source=L['band_mask'], accessed_by=DictGetItemGuardAccessor(band_mask)
| | +- TENSOR_MATCH: check_tensor(L['band_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 9, 64, 192], stride=[110592, 110592, 12288, 192, 1])
| | +- NO_HASATTR: hasattr(L['band_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| +- GuardManager: source=L['from_mask'], accessed_by=DictGetItemGuardAccessor(from_mask)
| | +- TENSOR_MATCH: check_tensor(L['from_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 832, 1], stride=[832, 832, 1, 1])
| | +- NO_HASATTR: hasattr(L['from_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| +- GuardManager: source=L['key_layer'], accessed_by=DictGetItemGuardAccessor(key_layer)
| | +- TENSOR_MATCH: check_tensor(L['key_layer'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.bfloat16, device=None, requires_grad=False, size=[1, 12, 832, 64], stride=[638976, 64, 768, 1])
| | +- NO_HASATTR: hasattr(L['key_layer'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| +- GuardManager: source=L['batch_size'], accessed_by=DictGetItemGuardAccessor(batch_size)
| | +- EQUALS_MATCH: L['batch_size'] == 1
| +- GuardManager: source=L['to_seq_len'], accessed_by=DictGetItemGuardAccessor(to_seq_len)
| | +- EQUALS_MATCH: L['to_seq_len'] == 832
| +- GuardManager: source=L['query_layer'], accessed_by=DictGetItemGuardAccessor(query_layer)
| | +- TENSOR_MATCH: check_tensor(L['query_layer'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.bfloat16, device=None, requires_grad=False, size=[1, 12, 832, 64], stride=[638976, 64, 768, 1])
| | +- NO_HASATTR: hasattr(L['query_layer'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| +- GuardManager: source=L['value_layer'], accessed_by=DictGetItemGuardAccessor(value_layer)
| | +- TENSOR_MATCH: check_tensor(L['value_layer'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.bfloat16, device=None, requires_grad=False, size=[1, 12, 832, 64], stride=[638976, 64, 768, 1])
| | +- NO_HASATTR: hasattr(L['value_layer'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| +- GuardManager: source=L['from_seq_len'], accessed_by=DictGetItemGuardAccessor(from_seq_len)
| | +- EQUALS_MATCH: L['from_seq_len'] == 832
| +- GuardManager: source=L['n_rand_blocks'], accessed_by=DictGetItemGuardAccessor(n_rand_blocks)
| | +- EQUALS_MATCH: L['n_rand_blocks'] == 3
| +- GuardManager: source=L['to_block_size'], accessed_by=DictGetItemGuardAccessor(to_block_size)
| | +- EQUALS_MATCH: L['to_block_size'] == 64
| +- GuardManager: source=L['from_block_size'], accessed_by=DictGetItemGuardAccessor(from_block_size)
| | +- EQUALS_MATCH: L['from_block_size'] == 64
| +- GuardManager: source=L['attn_mask_penalty'], accessed_by=DictGetItemGuardAccessor(attn_mask_penalty)
| | +- EQUALS_MATCH: L['attn_mask_penalty'] == -10000.0
| +- GuardManager: source=L['from_blocked_mask'], accessed_by=DictGetItemGuardAccessor(from_blocked_mask)
| | +- TENSOR_MATCH: check_tensor(L['from_blocked_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 13, 64], stride=[832, 64, 1])
| | +- NO_HASATTR: hasattr(L['from_blocked_mask'], '_dynamo_dynamic_indices') == False
| | +- TENSOR_ALIASING: L['from_blocked_mask'] is L['to_blocked_mask']
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['key_layer'], L['query_layer'], L['value_layer'], L['from_blocked_mask'], ___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| +- GuardManager: source=L['output_attentions'], accessed_by=DictGetItemGuardAccessor(output_attentions)
| | +- ID_MATCH: ___check_obj_id(L['output_attentions'], 7685824)
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['nn'], accessed_by=DictGetItemGuardAccessor(nn)
| | | +- ID_MATCH: ___check_obj_id(G['nn'], 139842442593680)
| | | +- GuardManager: source=G['nn'].functional, accessed_by=GetAttrGuardAccessor(functional)
| | | | +- ID_MATCH: ___check_obj_id(G['nn'].functional, 139842441627024)
| | | | +- GuardManager: source=G['nn'].functional.softmax, accessed_by=GetAttrGuardAccessor(softmax)
| | | | | +- ID_MATCH: ___check_obj_id(G['nn'].functional.softmax, 139842422997488)
| | +- GuardManager: source=G['np'], accessed_by=DictGetItemGuardAccessor(np)
| | | +- ID_MATCH: ___check_obj_id(G['np'], 139845228893488)
| | | +- GuardManager: source=G['np'].stack, accessed_by=GetAttrGuardAccessor(stack)
| | | | +- ID_MATCH: ___check_obj_id(G['np'].stack, 139844763318256)
| | +- GuardManager: source=G['torch'], accessed_by=DictGetItemGuardAccessor(torch)
| | | +- ID_MATCH: ___check_obj_id(G['torch'], 139845236322800)
| | | +- GuardManager: source=G['torch'].bmm, accessed_by=GetAttrGuardAccessor(bmm)
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].bmm, 139845228834192)
| | | +- GuardManager: source=G['torch'].cat, accessed_by=GetAttrGuardAccessor(cat)
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].cat, 139845228834672)
| | | +- GuardManager: source=G['torch'].div, accessed_by=GetAttrGuardAccessor(div)
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].div, 139845228790304)
| | | +- GuardManager: source=G['torch'].long, accessed_by=GetAttrGuardAccessor(long)
| | | | +- EQUALS_MATCH: G['torch'].long == torch.int64
| | | +- GuardManager: source=G['torch'].stack, accessed_by=GetAttrGuardAccessor(stack)
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].stack, 139845228799024)
| | | +- GuardManager: source=G['torch'].arange, accessed_by=GetAttrGuardAccessor(arange)
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].arange, 139845228706960)
| | | +- GuardManager: source=G['torch'].einsum, accessed_by=GetAttrGuardAccessor(einsum)
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].einsum, 139842415911568)
| | | +- GuardManager: source=G['torch'].tensor, accessed_by=GetAttrGuardAccessor(tensor)
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].tensor, 139845228703840)
| | | +- GuardManager: source=G['torch'].minimum, accessed_by=GetAttrGuardAccessor(minimum)
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].minimum, 139845228824272)
| | | +- GuardManager: source=G['torch'].transpose, accessed_by=GetAttrGuardAccessor(transpose)
| | | | +- ID_MATCH: ___check_obj_id(G['torch'].transpose, 139845228736688)
| | +- GuardManager: source=G['__builtins_dict___46'], accessed_by=DictGetItemGuardAccessor(__builtins_dict___46)
| | | +- GuardManager: source=G['__builtins_dict___46']['len'], accessed_by=DictGetItemGuardAccessor(len)
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___46']['len'], 139845257826832)
| | | +- GuardManager: source=G['__builtins_dict___46']['zip'], accessed_by=DictGetItemGuardAccessor(zip)
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___46']['zip'], 7491872)
| | | +- GuardManager: source=G['__builtins_dict___46']['range'], accessed_by=DictGetItemGuardAccessor(range)
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___46']['range'], 7632448)
| | +- GuardManager: source=G['__import_torch_dot__dynamo_dot_polyfill'], accessed_by=DictGetItemGuardAccessor(__import_torch_dot__dynamo_dot_polyfill)
| | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot__dynamo_dot_polyfill'], 139839728158336)
| +- GuardManager: source=L['to_blocked_mask'], accessed_by=DictGetItemGuardAccessor(to_blocked_mask)
| | +- TENSOR_ALIASING: L['from_blocked_mask'] is L['to_blocked_mask']
V0627 17:31:09.568000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "15/0", "frame_key": "20", "co_name": "torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 583, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 58, "shape_env_guard_count": 0, "graph_op_count": 208, "graph_node_count": 228, "graph_input_count": 19, "start_time": 1719534664.260442, "entire_frame_compile_time_s": 5.308261871337891, "backend_compile_time_s": 5.101780414581299, "inductor_compile_time_s": 4.007972240447998, "code_gen_time_s": 3.5389716625213623, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": [], "dynamo_time_before_restart_s": 0.0, "has_guarded_code": true}, "frame_id": 15, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.580000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 472, "name": "forward", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 16, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.582000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 40, "size": 2555904}, "frame_id": 16, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.582000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 12, 64], "is_leaf": true, "stride": [638976, 64, 53248, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e311dbd80>", "describer_id": 40}, "frame_id": 16, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.582000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 40, "id": 0, "source": "L['___stack0'][0]"}, "frame_id": 16, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.587000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {"l_stack0_0_": [1, 832, 12, 64], "contiguous": [1, 832, 12, 64], "context_layer": [1, 832, 768]}}, "frame_id": 16, "frame_compile_id": 0, "attempt": 0, "has_payload": "153b3dc8bb7ea7326b02a24531cf2b23"}
class GraphModule(torch.nn.Module):
    def forward(self, L_stack0_0_: "f32[1, 832, 12, 64][638976, 64, 53248, 1]cpu"):
        l_stack0_0_ = L_stack0_0_
        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:495 in torch_dynamo_resume_in_forward_at_472, code: context_layer = context_layer.contiguous().view(batch_size, from_seq_length, -1)
        contiguous: "f32[1, 832, 12, 64][638976, 768, 64, 1]cpu" = l_stack0_0_.contiguous(); l_stack0_0_ = None
        context_layer: "f32[1, 832, 768][638976, 768, 1]cpu" = contiguous.view(1, 832, -1); contiguous = None
        return (context_layer,)
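# The input arrives with strides (638976, 64, 53248, 1), i.e. a permuted
# (head-major) memory layout, so .view() cannot reinterpret it directly;
# .contiguous() materializes a row-major copy first, which is why the graphs
# below contain a clone followed by a view/reshape.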
V0627 17:31:09.599000 139845268738432 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:199] {"aot_forward_graph": {}, "frame_id": 16, "frame_compile_id": 0, "attempt": 0, "has_payload": "4088b7608c41845b848a0fa539961d1e"}
class <lambda>(torch.nn.Module):
    def forward(self, arg0_1: "f32[1, 832, 12, 64][638976, 64, 53248, 1]cpu"):
        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:495 in torch_dynamo_resume_in_forward_at_472, code: context_layer = context_layer.contiguous().view(batch_size, from_seq_length, -1)
        clone: "f32[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.clone.default(arg0_1, memory_format = torch.contiguous_format); arg0_1 = None
        view: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.view.default(clone, [1, 832, -1]); clone = None
        return (view,)
V0627 17:31:09.609000 139845268738432 torch/_inductor/compile_fx.py:754] {"inductor_post_grad_graph": {}, "frame_id": 16, "frame_compile_id": 0, "attempt": 0, "has_payload": "33da1fe849e643eaf3458df62aaeea7e"}
class <lambda>(torch.nn.Module):
    def forward(self, arg0_1: "f32[1, 832, 12, 64][638976, 64, 53248, 1]cpu"):
        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:495 in torch_dynamo_resume_in_forward_at_472, code: context_layer = context_layer.contiguous().view(batch_size, from_seq_length, -1)
        clone: "f32[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.clone.default(arg0_1, memory_format = torch.contiguous_format); arg0_1 = None
        view: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.reshape.default(clone, [1, 832, -1]); clone = None
        return (view,)
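# Note the only post-grad change: aten.view is normalized to aten.reshape. The
# clone then lowers to the single transpose-copy kernel in the generated module
# below, while the reshape becomes a zero-copy reinterpret_tensor in call().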
V0627 17:31:09.703000 139845268738432 torch/_inductor/graph.py:1693] {"inductor_output_code": {"filename": "/tmp/torchinductor_leslie/7l/c7lltvlss5l4w5dsp4k3kpmjg6nemqpgb5mrjqqw2csgjbuvtav3.py"}, "frame_id": 16, "frame_compile_id": 0, "attempt": 0, "has_payload": "675b3bf5875d915c125bff4b02eb31f4"}
# AOT ID: ['9_inference']
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
from torch._inductor.codegen.memory_planning import _align as align
from torch import device, empty_strided
from torch._inductor.async_compile import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
from torch._inductor.codegen.multi_kernel import MultiKernelCall

aten = torch.ops.aten
inductor_ops = torch.ops.inductor
_quantized = torch.ops._quantized
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
alloc_from_pool = torch.ops.inductor._alloc_from_pool
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
async_compile = AsyncCompile()

cpp_fused_clone_0 = async_compile.cpp_pybinding(['const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const float* in_ptr0,
                       float* out_ptr0)
{
    #pragma omp parallel num_threads(56)
    {
        int tid = omp_get_thread_num();
        {
            #pragma omp for
            for(long x0=static_cast<long>(0L); x0<static_cast<long>(832L); x0+=static_cast<long>(1L))
            {
                #pragma GCC ivdep
                for(long x1=static_cast<long>(0L); x1<static_cast<long>(12L); x1+=static_cast<long>(1L))
                {
                    for(long x2=static_cast<long>(0L); x2<static_cast<long>(64L); x2+=static_cast<long>(16L))
                    {
                        auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<long>(x2 + (64L*x0) + (53248L*x1)), 16);
                        tmp0.store(out_ptr0 + static_cast<long>(x2 + (64L*x1) + (768L*x0)));
                    }
                }
            }
        }
    }
}
''')

async_compile.wait(globals())
del async_compile

def call(args):
    arg0_1, = args
    args.clear()
    assert_size_stride(arg0_1, (1, 832, 12, 64), (638976, 64, 53248, 1))
    buf0 = empty_strided_cpu((1, 832, 12, 64), (638976, 768, 64, 1), torch.float32)
    cpp_fused_clone_0(arg0_1, buf0)
    del arg0_1
    return (reinterpret_tensor(buf0, (1, 832, 768), (638976, 768, 1), 0), )

def benchmark_compiled_module(times=10, repeat=10):
    from torch._dynamo.testing import rand_strided
    from torch._inductor.utils import print_performance
    arg0_1 = rand_strided((1, 832, 12, 64), (638976, 64, 53248, 1), device='cpu', dtype=torch.float32)
    fn = lambda: call([arg0_1])
    return print_performance(fn, times=times, repeat=repeat)

if __name__ == "__main__":
    from torch._inductor.wrapper_benchmark import compiled_module_main
    compiled_module_main('hf_BigBird', benchmark_compiled_module)
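The generated wrapper is directly exercisable: the single kernel cpp_fused_clone_0 performs the permuted copy, and the final view is free (reinterpret_tensor only rewrites metadata). A hedged usage sketch, assuming this file has been imported so that call() is in scope:

    import torch
    from torch._dynamo.testing import rand_strided

    inp = rand_strided((1, 832, 12, 64), (638976, 64, 53248, 1), device='cpu', dtype=torch.float32)
    expected = inp.contiguous().view(1, 832, -1)
    (out,) = call([inp])  # call() empties the args list, but our local reference keeps inp alive
    assert torch.equal(out, expected)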
V0627 17:31:09.710000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 16, "frame_compile_id": 0, "attempt": 0, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:09.710000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 16, "frame_compile_id": 0, "attempt": 0, "has_payload": "832b6bdf2f2092cb0e2ca7f3e3a30237"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['___stack0'], accessed_by=DictGetItemGuardAccessor(___stack0)
| | +- TYPE_MATCH: ___check_type_id(L['___stack0'], 7625984)
| | +- LENGTH_CHECK: len(L['___stack0']) == 2
| | +- GuardManager: source=L['___stack0'][0], accessed_by=TupleGetItemGuardAccessor(0)
| | | +- TENSOR_MATCH: check_tensor(L['___stack0'][0], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 832, 12, 64], stride=[638976, 64, 53248, 1])
| | | +- NO_HASATTR: hasattr(L['___stack0'][0], '_dynamo_dynamic_indices') == False
| | +- GuardManager: source=L['___stack0'][1], accessed_by=TupleGetItemGuardAccessor(1)
| | | +- ID_MATCH: ___check_obj_id(L['___stack0'][1], 7636800)
| +- GuardManager: source=L['batch_size'], accessed_by=DictGetItemGuardAccessor(batch_size)
| | +- EQUALS_MATCH: L['batch_size'] == 1
| +- GuardManager: source=L['from_seq_length'], accessed_by=DictGetItemGuardAccessor(from_seq_length)
| | +- EQUALS_MATCH: L['from_seq_length'] == 832
| +- GuardManager: source=L['output_attentions'], accessed_by=DictGetItemGuardAccessor(output_attentions)
| | +- ID_MATCH: ___check_obj_id(L['output_attentions'], 7685824)
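The guard tree records what this frame specialized on: dtype/shape/stride of the tensor via TENSOR_MATCH, and the Python ints batch_size == 1 and from_seq_length == 832 via EQUALS_MATCH, so any other sequence length fails the guards and recompiles the frame. If dynamic sequence lengths were wanted, one illustrative escape hatch (an assumption about usage, not something this run does) is to mark the sequence dim dynamic before tracing:

    import torch

    torch._dynamo.mark_dynamic(input_ids, 1)  # `input_ids` is a hypothetical handle to the model input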
V0627 17:31:09.710000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "16/0", "frame_key": "21", "co_name": "torch_dynamo_resume_in_forward_at_472", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 472, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 11, "shape_env_guard_count": 0, "graph_op_count": 2, "graph_node_count": 4, "graph_input_count": 1, "start_time": 1719534669.5804062, "entire_frame_compile_time_s": 0.13004136085510254, "backend_compile_time_s": 0.12020564079284668, "inductor_compile_time_s": 0.09919452667236328, "code_gen_time_s": 0.08350419998168945, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": [], "dynamo_time_before_restart_s": 0.0, "has_guarded_code": true}, "frame_id": 16, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.711000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.719000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 42, "size": 2555904}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.719000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30911a30>", "describer_id": 42}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.719000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 42, "id": 0, "source": "L['___stack0'][0]"}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.724000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 3, "describer_id": 42, "size": 2555904}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.724000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 4, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 3, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6e30>", "describer_id": 42}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.724000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 42, "id": 4, "source": "L['hidden_states']"}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.731000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {"l_stack0_0_": [1, 832, 768], "l_hidden_states_": [1, 832, 768], "hidden_states": [1, 832, 768], "hidden_states_1": [1, 832, 768], "add": [1, 832, 768], "hidden_states_2": [1, 832, 768]}}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0, "has_payload": "5cbaeaa3b94e9560f38738cbbbf2efd6"}
class GraphModule(torch.nn.Module):
    def forward(self, L_stack0_0_: "f32[1, 832, 768][638976, 768, 1]cpu", L_hidden_states_: "f32[1, 832, 768][638976, 768, 1]cpu"):
        l_stack0_0_ = L_stack0_0_
        l_hidden_states_ = L_hidden_states_

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1316 in forward, code: hidden_states = self.dense(hidden_states)
        hidden_states: "bf16[1, 832, 768][638976, 768, 1]cpu" = self.L__self___output_dense(l_stack0_0_); l_stack0_0_ = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1317 in forward, code: hidden_states = self.dropout(hidden_states)
        hidden_states_1: "bf16[1, 832, 768][638976, 768, 1]cpu" = self.L__self___output_dropout(hidden_states); hidden_states = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1318 in forward, code: hidden_states = self.LayerNorm(hidden_states + input_tensor)
        add: "f32[1, 832, 768][638976, 768, 1]cpu" = hidden_states_1 + l_hidden_states_; hidden_states_1 = l_hidden_states_ = None
        hidden_states_2: "f32[1, 832, 768][638976, 768, 1]cpu" = self.L__self___output_LayerNorm(add); add = None
        return (hidden_states_2,)
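This frame (the attention output projection plus residual LayerNorm) shows the bf16 autocast pattern: dense and dropout produce bf16 (note the AutocastCPU key in this frame's TENSOR_MATCH guards), while the residual add promotes back to fp32 for the LayerNorm. An eager sketch of the same dtype flow, with dense/ln as stand-ins for the real modules:

    import torch

    dense = torch.nn.Linear(768, 768)
    ln = torch.nn.LayerNorm(768, eps=1e-12)
    x = torch.randn(1, 832, 768)
    residual = torch.randn(1, 832, 768)
    with torch.autocast("cpu", dtype=torch.bfloat16):
        h = dense(x)            # bf16 under autocast
        out = ln(h + residual)  # bf16 + f32 promotes to f32; LayerNorm runs in f32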
V0627 17:31:09.771000 139845268738432 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:199] {"aot_forward_graph": {}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0, "has_payload": "1cd1232b8ea80a91453ce72d7309f42c"}
class <lambda>(torch.nn.Module):
    def forward(self, arg0_1: "f32[768, 768][768, 1]cpu", arg1_1: "f32[768][1]cpu", arg2_1: "f32[768][1]cpu", arg3_1: "f32[768][1]cpu", arg4_1: "f32[1, 832, 768][638976, 768, 1]cpu", arg5_1: "f32[1, 832, 768][638976, 768, 1]cpu"):
        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1316 in forward, code: hidden_states = self.dense(hidden_states)
        convert_element_type: "bf16[768][1]cpu" = torch.ops.prims.convert_element_type.default(arg1_1, torch.bfloat16); arg1_1 = None
        convert_element_type_1: "bf16[768, 768][768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg0_1, torch.bfloat16); arg0_1 = None
        convert_element_type_2: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg4_1, torch.bfloat16); arg4_1 = None
        view: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.view.default(convert_element_type_2, [832, 768]); convert_element_type_2 = None
        permute: "bf16[768, 768][1, 768]cpu" = torch.ops.aten.permute.default(convert_element_type_1, [1, 0]); convert_element_type_1 = None
        addmm: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.addmm.default(convert_element_type, view, permute); convert_element_type = view = permute = None
        view_1: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.view.default(addmm, [1, 832, 768]); addmm = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1317 in forward, code: hidden_states = self.dropout(hidden_states)
        clone: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.clone.default(view_1); view_1 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1318 in forward, code: hidden_states = self.LayerNorm(hidden_states + input_tensor)
        add: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(clone, arg5_1); clone = arg5_1 = None
        var_mean = torch.ops.aten.var_mean.correction(add, [2], correction = 0, keepdim = True)
        getitem: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[0]
        getitem_1: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[1]; var_mean = None
        add_1: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.add.Tensor(getitem, 1e-12); getitem = None
        rsqrt: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.rsqrt.default(add_1); add_1 = None
        sub: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.sub.Tensor(add, getitem_1); add = getitem_1 = None
        mul: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(sub, rsqrt); sub = rsqrt = None
        mul_1: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(mul, arg2_1); mul = arg2_1 = None
        add_2: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(mul_1, arg3_1); mul_1 = arg3_1 = None
        return (add_2,)
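The AOT graph applies the standard decompositions: nn.Linear becomes addmm(bias, input_2d, weight.T), eval-mode Dropout is a bare clone, and LayerNorm is inlined as var_mean (correction=0, i.e. biased variance) -> rsqrt -> scale/shift with eps 1e-12. A reference sketch of that tail:

    import torch

    def layer_norm_ref(x, weight, bias, eps=1e-12):
        # matches the var_mean / rsqrt / mul / add chain in the graph above
        var, mean = torch.var_mean(x, dim=-1, correction=0, keepdim=True)
        return (x - mean) * torch.rsqrt(var + eps) * weight + bias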
V0627 17:31:09.822000 139845268738432 torch/_inductor/compile_fx.py:754] {"inductor_post_grad_graph": {}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0, "has_payload": "e2b95d7d56d3ed2a8ad6cfb284f41613"}
class <lambda>(torch.nn.Module):
    def forward(self, arg4_1: "f32[1, 832, 768][638976, 768, 1]cpu", arg5_1: "f32[1, 832, 768][638976, 768, 1]cpu"):
        # No stacktrace found for following nodes
        _frozen_param2: "f32[768][1]cpu" = self._frozen_param2
        _frozen_param3: "f32[768][1]cpu" = self._frozen_param3

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1316 in forward, code: hidden_states = self.dense(hidden_states)
        _frozen_param4: "bf16[768][1]cpu" = self._frozen_param4

        # No stacktrace found for following nodes
        _frozen_param6: "bf16[768, 768][1, 0]cpu" = self._frozen_param6

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1316 in forward, code: hidden_states = self.dense(hidden_states)
        convert_element_type_2: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg4_1, torch.bfloat16); arg4_1 = None
        _linear_pointwise_default_1: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.mkldnn._linear_pointwise.default(convert_element_type_2, _frozen_param6, _frozen_param4, 'none', [], ''); convert_element_type_2 = _frozen_param6 = _frozen_param4 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1318 in forward, code: hidden_states = self.LayerNorm(hidden_states + input_tensor)
        add: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(_linear_pointwise_default_1, arg5_1); _linear_pointwise_default_1 = arg5_1 = None
        var_mean = torch.ops.aten.var_mean.correction(add, [2], correction = 0, keepdim = True)
        getitem: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[0]
        getitem_1: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[1]; var_mean = None
        sub: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.sub.Tensor(add, getitem_1); add = getitem_1 = None
        add_1: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.add.Tensor(getitem, 1e-12); getitem = None
        rsqrt: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.rsqrt.default(add_1); add_1 = None
        mul: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(sub, rsqrt); sub = rsqrt = None
        mul_1: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(mul, _frozen_param2); mul = _frozen_param2 = None
        add_2: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(mul_1, _frozen_param3); mul_1 = _frozen_param3 = None
        return (add_2,)
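After the post-grad/freezing passes the parameters disappear from the signature: weights and biases are constant-folded into _frozen_param* attributes, cast to bf16 ahead of time, and the weight is prepacked for oneDNN. The nominal stride (1, 0) on _frozen_param6 is the tell-tale of an opaque packed layout, not a real dense stride, and the addmm is replaced by torch.ops.mkldnn._linear_pointwise with no fused epilogue ('none'). This shape of graph is roughly what the following configuration produces (an assumption about the run, not recorded in this log):

    import torch
    import torch._inductor.config as inductor_config

    inductor_config.freezing = True   # fold parameters into constants and prepack for oneDNN
    compiled = torch.compile(model)   # `model` is a hypothetical handle; exact flags are not in the log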
V0627 17:31:09.919000 139845268738432 torch/_inductor/graph.py:1693] {"inductor_output_code": {"filename": "/tmp/torchinductor_leslie/ot/cotc6xdws22smodcitafp7uurqklfk4ux2ijtnzkqwktzn6c3wk3.py"}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0, "has_payload": "320320d26970537cad9fa4b92420ab78"}
# AOT ID: ['10_inference']
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
from torch._inductor.codegen.memory_planning import _align as align
from torch import device, empty_strided
from torch._inductor.async_compile import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
from torch._inductor.codegen.multi_kernel import MultiKernelCall

aten = torch.ops.aten
inductor_ops = torch.ops.inductor
_quantized = torch.ops._quantized
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
alloc_from_pool = torch.ops.inductor._alloc_from_pool
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
async_compile = AsyncCompile()

_frozen_param2 = None  # device(type='cpu') torch.float32 (768,) (1,) 7f2eb1d44fe0
_frozen_param3 = None  # device(type='cpu') torch.float32 (768,) (1,) 7f2eb1d45080
_frozen_param4 = None  # device(type='cpu') torch.bfloat16 (768,) (1,) 7f2e30928a90
_frozen_param6 = None  # device(type='cpu') torch.bfloat16 (768, 768) (1, 0) 7f2e303a2cf0

cpp_fused__to_copy_0 = async_compile.cpp_pybinding(['const float*', 'bfloat16*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const float* in_ptr0,
                       bfloat16* out_ptr0)
{
    #pragma omp parallel num_threads(56)
    {
        int tid = omp_get_thread_num();
        {
            #pragma omp for
            for(long x0=static_cast<long>(0L); x0<static_cast<long>(638976L); x0+=static_cast<long>(16L))
            {
                auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<long>(x0), 16);
                auto tmp1 = at::vec::convert<bfloat16>(tmp0);
                tmp1.store(out_ptr0 + static_cast<long>(x0), 16);
            }
        }
    }
}
''')

cpp_fused_add_native_layer_norm_1 = async_compile.cpp_pybinding(['const bfloat16*', 'const float*', 'const float*', 'const float*', 'float*', 'float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const bfloat16* in_ptr0,
                       const float* in_ptr1,
                       const float* in_ptr2,
                       const float* in_ptr3,
                       float* out_ptr0,
                       float* out_ptr1,
                       float* out_ptr2)
{
    #pragma omp parallel num_threads(56)
    {
        int tid = omp_get_thread_num();
        {
            #pragma omp for
            for(long x0=static_cast<long>(0L); x0<static_cast<long>(832L); x0+=static_cast<long>(1L))
            {
                {
                    Welford<float> tmp_acc0 = Welford<float>();
                    Welford<at::vec::Vectorized<float>> tmp_acc0_vec = Welford<at::vec::Vectorized<float>>();
                    static WeightRecp<at::vec::Vectorized<float>> weight_recps(static_cast<long>(48L));
                    for(long x1=static_cast<long>(0L); x1<static_cast<long>(768L); x1+=static_cast<long>(16L))
                    {
                        auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x1 + (768L*x0)), 16);
                        auto tmp2 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(x1 + (768L*x0)), 16);
                        auto tmp1 = at::vec::convert<float>(tmp0);
                        auto tmp3 = tmp1 + tmp2;
                        tmp_acc0_vec = welford_combine(tmp_acc0_vec, tmp3, &weight_recps);
                    }
                    tmp_acc0 = welford_combine(tmp_acc0, welford_vec_reduce_all(tmp_acc0_vec));
                    out_ptr0[static_cast<long>(x0)] = static_cast<float>(tmp_acc0.mean);
                    out_ptr1[static_cast<long>(x0)] = static_cast<float>(tmp_acc0.m2);
                }
                for(long x1=static_cast<long>(0L); x1<static_cast<long>(768L); x1+=static_cast<long>(16L))
                {
                    auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x1 + (768L*x0)), 16);
                    auto tmp2 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(x1 + (768L*x0)), 16);
                    auto tmp4 = out_ptr0[static_cast<long>(x0)];
                    auto tmp7 = out_ptr1[static_cast<long>(x0)];
                    auto tmp15 = at::vec::Vectorized<float>::loadu(in_ptr2 + static_cast<long>(x1), 16);
                    auto tmp17 = at::vec::Vectorized<float>::loadu(in_ptr3 + static_cast<long>(x1), 16);
                    auto tmp1 = at::vec::convert<float>(tmp0);
                    auto tmp3 = tmp1 + tmp2;
                    auto tmp5 = at::vec::Vectorized<float>(tmp4);
                    auto tmp6 = tmp3 - tmp5;
                    auto tmp8 = static_cast<float>(768.0);
                    auto tmp9 = tmp7 / tmp8;
                    auto tmp10 = static_cast<float>(1e-12);
                    auto tmp11 = decltype(tmp9)(tmp9 + tmp10);
                    auto tmp12 = 1 / std::sqrt(tmp11);
                    auto tmp13 = at::vec::Vectorized<float>(tmp12);
                    auto tmp14 = tmp6 * tmp13;
                    auto tmp16 = tmp14 * tmp15;
                    auto tmp18 = tmp16 + tmp17;
                    tmp18.store(out_ptr2 + static_cast<long>(x1 + (768L*x0)));
                }
            }
        }
    }
}
''')
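# The kernel above computes the LayerNorm statistics in a single pass with
# Welford's algorithm: welford_combine maintains (mean, m2, weight) per lane
# using delta = x - mean; mean += delta/n; m2 += delta*(x - mean), with
# WeightRecp caching the 48 (= 768/16) reciprocal weights, and
# welford_vec_reduce_all folds the 16-lane vector accumulator into a scalar.
# The second x1 loop then normalizes with rsqrt(m2/768 + 1e-12) and applies
# the affine weight (in_ptr2) and bias (in_ptr3).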
async_compile.wait(globals())
del async_compile

def call(args):
    arg4_1, arg5_1 = args
    args.clear()
    assert_size_stride(arg4_1, (1, 832, 768), (638976, 768, 1))
    assert_size_stride(arg5_1, (1, 832, 768), (638976, 768, 1))
    buf0 = empty_strided_cpu((1, 832, 768), (638976, 768, 1), torch.bfloat16)
    cpp_fused__to_copy_0(arg4_1, buf0)
    del arg4_1
    buf1 = torch.ops.mkldnn._linear_pointwise(buf0, _frozen_param6, _frozen_param4, 'none', [-1], '')
    del buf0
    buf2 = empty_strided_cpu((1, 832, 1), (832, 1, 832), torch.float32)
    buf3 = empty_strided_cpu((1, 832, 1), (832, 1, 832), torch.float32)
    buf5 = empty_strided_cpu((1, 832, 768), (638976, 768, 1), torch.float32)
    cpp_fused_add_native_layer_norm_1(buf1, arg5_1, _frozen_param2, _frozen_param3, buf2, buf3, buf5)
    del arg5_1
    return (buf5, )

def benchmark_compiled_module(times=10, repeat=10):
    from torch._dynamo.testing import rand_strided
    from torch._inductor.utils import print_performance
    global _frozen_param2
    _frozen_param2 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.float32)
    global _frozen_param3
    _frozen_param3 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.float32)
    global _frozen_param4
    _frozen_param4 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.bfloat16)
    global _frozen_param6
    _frozen_param6 = rand_strided((768, 768), (1, 0), device='cpu', dtype=torch.bfloat16)
    arg4_1 = rand_strided((1, 832, 768), (638976, 768, 1), device='cpu', dtype=torch.float32)
    arg5_1 = rand_strided((1, 832, 768), (638976, 768, 1), device='cpu', dtype=torch.float32)
    fn = lambda: call([arg4_1, arg5_1])
    return print_performance(fn, times=times, repeat=repeat)

if __name__ == "__main__":
    from torch._inductor.wrapper_benchmark import compiled_module_main
    compiled_module_main('hf_BigBird', benchmark_compiled_module)
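The trailing harness makes each generated file a standalone script: running it calls compiled_module_main('hf_BigBird', benchmark_compiled_module), which populates the _frozen_param* globals with rand_strided stand-ins and times call() via print_performance, e.g. (illustrative invocation):

    python /tmp/torchinductor_leslie/ot/cotc6xdws22smodcitafp7uurqklfk4ux2ijtnzkqwktzn6c3wk3.py

One caveat (an observation, not something the log verifies): at real run time _frozen_param6 is an opaque oneDNN-packed weight, so the dense rand_strided((768, 768), (1, 0)) stand-in reproduces only the metadata, not the packed layout.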
V0627 17:31:09.931000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:09.932000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0, "has_payload": "cc9600447bc28ad3ba928d7719c0654d"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202275632)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| | | +- GuardManager: source=L['self']._modules, accessed_by=DictGetItemGuardAccessor(_modules)
| | | | +- GuardManager: source=L['self'].output, accessed_by=DictGetItemGuardAccessor(output)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].output, 139839202272272)
| | | | | +- GuardManager: source=L['self'].output.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- DICT_CONTAINS: not ___dict_contains('forward', L['self'].output.__dict__)
| | | | | | +- GuardManager: source=L['self'].output.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.training, 7685824)
| | | | | | +- GuardManager: source=L['self'].output._modules, accessed_by=DictGetItemGuardAccessor(_modules)
| | | | | | | +- GuardManager: source=L['self'].output.dense, accessed_by=DictGetItemGuardAccessor(dense)
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.dense, 139839202271456)
| | | | | | | | +- GuardManager: source=L['self'].output.dense.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | | | | +- GuardManager: source=L['self'].output.dense.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.dense.training, 7685824)
| | | | | | | +- GuardManager: source=L['self'].output.dropout, accessed_by=DictGetItemGuardAccessor(dropout)
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.dropout, 139839202271168)
| | | | | | | | +- GuardManager: source=L['self'].output.dropout.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | | | | +- GuardManager: source=L['self'].output.dropout.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.dropout.training, 7685824)
| | | | | | | +- GuardManager: source=L['self'].output.LayerNorm, accessed_by=DictGetItemGuardAccessor(LayerNorm)
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.LayerNorm, 139839202271504)
| | | | | | | | +- GuardManager: source=L['self'].output.LayerNorm.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | | | | +- GuardManager: source=L['self'].output.LayerNorm.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.LayerNorm.training, 7685824)
| | | | | | +- GuardManager: source=L['self'].output._forward_hooks, accessed_by=DictGetItemGuardAccessor(_forward_hooks)
| | | | | | +- GuardManager: source=L['self'].output._backward_hooks, accessed_by=DictGetItemGuardAccessor(_backward_hooks)
| | | | | | +- GuardManager: source=L['self'].output._forward_pre_hooks, accessed_by=DictGetItemGuardAccessor(_forward_pre_hooks)
| | | | | | +- GuardManager: source=L['self'].output._backward_pre_hooks, accessed_by=DictGetItemGuardAccessor(_backward_pre_hooks)
| +- GuardManager: source=L['___stack0'], accessed_by=DictGetItemGuardAccessor(___stack0)
| | +- TYPE_MATCH: ___check_type_id(L['___stack0'], 7625984)
| | +- LENGTH_CHECK: len(L['___stack0']) == 1
| | +- GuardManager: source=L['___stack0'][0], accessed_by=TupleGetItemGuardAccessor(0)
| | | +- TENSOR_MATCH: check_tensor(L['___stack0'][0], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 832, 768], stride=[638976, 768, 1])
| | | +- NO_HASATTR: hasattr(L['___stack0'][0], '_dynamo_dynamic_indices') == False
| | | +- NO_TENSOR_ALIASING: check_no_aliasing(L['___stack0'][0], L['hidden_states'])
| +- GuardManager: source=L['hidden_states'], accessed_by=DictGetItemGuardAccessor(hidden_states)
| | +- TENSOR_MATCH: check_tensor(L['hidden_states'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 832, 768], stride=[638976, 768, 1])
| | +- NO_HASATTR: hasattr(L['hidden_states'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['___stack0'][0], L['hidden_states'])
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'], accessed_by=DictGetItemGuardAccessor(__import_torch_dot_nn_dot_modules_dot_module)
| | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'], 139842442598640)
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'].torch, accessed_by=GetAttrGuardAccessor(torch)
| | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'].torch, 139845236322800)
| | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C, accessed_by=GetAttrGuardAccessor(_C)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C, 139845228547104)
| | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C._get_tracing_state, accessed_by=GetAttrGuardAccessor(_get_tracing_state)
| | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C._get_tracing_state, 139842451895088)
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_hooks, accessed_by=GetAttrGuardAccessor(_global_forward_hooks)
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_hooks, 7497696)
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_hooks
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_hooks, accessed_by=GetAttrGuardAccessor(_global_backward_hooks)
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_hooks, 7497696)
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_hooks
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_pre_hooks, accessed_by=GetAttrGuardAccessor(_global_forward_pre_hooks)
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_pre_hooks, 7497696)
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_pre_hooks
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_pre_hooks, accessed_by=GetAttrGuardAccessor(_global_backward_pre_hooks)
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_pre_hooks, 7497696)
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_pre_hooks
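Beyond the tensor checks, this tree pins module identity (ID_MATCH on self.output and its children), eval mode (each .training guarded to False, id 7685824), empty per-module and global hook dicts, and NO_TENSOR_ALIASING between the two tensor inputs. Changing any of these invalidates the entry; for instance (illustrative, `layer` is a hypothetical handle to this BigBird layer):

    layer.output.register_forward_hook(lambda mod, args, out: out)  # hooks dict no longer empty -> recompile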
V0627 17:31:09.932000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "17/0", "frame_key": "22", "co_name": "torch_dynamo_resume_in_forward_at_1401", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 1401, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 34, "shape_env_guard_count": 0, "graph_op_count": 4, "graph_node_count": 7, "graph_input_count": 2, "start_time": 1719534669.711534, "entire_frame_compile_time_s": 0.22069621086120605, "backend_compile_time_s": 0.1933588981628418, "inductor_compile_time_s": 0.11173701286315918, "code_gen_time_s": 0.08121824264526367, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": [], "dynamo_time_before_restart_s": 0.0, "has_guarded_code": true}, "frame_id": 17, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.933000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 18, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.935000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 44, "size": 2555904}, "frame_id": 18, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.935000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e309a5760>", "describer_id": 44}, "frame_id": 18, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.935000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 44, "id": 0, "source": "L['___stack0'][0]"}, "frame_id": 18, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:09.984000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {"l_stack0_0_": [1, 832, 768], "hidden_states": [1, 832, 3072], "mul": [1, 832, 3072], "pow_1": [1, 832, 3072], "mul_1": [1, 832, 3072], "add": [1, 832, 3072], "mul_2": [1, 832, 3072], "tanh": [1, 832, 3072], "add_1": [1, 832, 3072], "hidden_states_1": [1, 832, 3072], "hidden_states_2": [1, 832, 768], "hidden_states_3": [1, 832, 768], "add_2": [1, 832, 768], "hidden_states_4": [1, 832, 768]}}, "frame_id": 18, "frame_compile_id": 0, "attempt": 0, "has_payload": "28432eb8c22b77d39d8eae55f0796aec"}
class GraphModule(torch.nn.Module):
    def forward(self, L_stack0_0_: "f32[1, 832, 768][638976, 768, 1]cpu"):
        l_stack0_0_ = L_stack0_0_

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1421 in forward, code: hidden_states = self.dense(hidden_states)
        hidden_states: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = self.L__self___intermediate_dense(l_stack0_0_)

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/activations.py:57 in forward, code: return 0.5 * input * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0))))
        mul: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = 0.5 * hidden_states
        pow_1: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.pow(hidden_states, 3.0)
        mul_1: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = 0.044715 * pow_1; pow_1 = None
        add: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = hidden_states + mul_1; hidden_states = mul_1 = None
        mul_2: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = 0.7978845608028654 * add; add = None
        tanh: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.tanh(mul_2); mul_2 = None
        add_1: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = 1.0 + tanh; tanh = None
        hidden_states_1: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = mul * add_1; mul = add_1 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1435 in forward, code: hidden_states = self.dense(hidden_states)
        hidden_states_2: "bf16[1, 832, 768][638976, 768, 1]cpu" = self.L__self___output_dense(hidden_states_1); hidden_states_1 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1436 in forward, code: hidden_states = self.dropout(hidden_states)
        hidden_states_3: "bf16[1, 832, 768][638976, 768, 1]cpu" = self.L__self___output_dropout(hidden_states_2); hidden_states_2 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1437 in forward, code: hidden_states = self.LayerNorm(hidden_states + input_tensor)
        add_2: "f32[1, 832, 768][638976, 768, 1]cpu" = hidden_states_3 + l_stack0_0_; hidden_states_3 = l_stack0_0_ = None
        hidden_states_4: "f32[1, 832, 768][638976, 768, 1]cpu" = self.L__self___output_LayerNorm(add_2); add_2 = None
        return (hidden_states_4,)
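The activations.py line traced here is the tanh approximation of GELU, gelu(x) ~= 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3))), with sqrt(2/pi) ~= 0.7978845608028654. It matches PyTorch's built-in approximate mode:

    import torch
    import torch.nn.functional as F

    x = torch.randn(1024)
    ref = 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * (x + 0.044715 * torch.pow(x, 3.0))))
    assert torch.allclose(ref, F.gelu(x, approximate='tanh'), atol=1e-6)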
V0627 17:31:10.051000 139845268738432 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:199] {"aot_forward_graph": {}, "frame_id": 18, "frame_compile_id": 0, "attempt": 0, "has_payload": "c94939d327a02b378b1745a04171ca4e"}
class <lambda>(torch.nn.Module):
    def forward(self, arg0_1: "f32[3072, 768][768, 1]cpu", arg1_1: "f32[3072][1]cpu", arg2_1: "f32[768, 3072][3072, 1]cpu", arg3_1: "f32[768][1]cpu", arg4_1: "f32[768][1]cpu", arg5_1: "f32[768][1]cpu", arg6_1: "f32[1, 832, 768][638976, 768, 1]cpu"):
        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1421 in forward, code: hidden_states = self.dense(hidden_states)
        convert_element_type: "bf16[3072][1]cpu" = torch.ops.prims.convert_element_type.default(arg1_1, torch.bfloat16); arg1_1 = None
        convert_element_type_1: "bf16[3072, 768][768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg0_1, torch.bfloat16); arg0_1 = None
        convert_element_type_2: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg6_1, torch.bfloat16)
        view: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.view.default(convert_element_type_2, [832, 768]); convert_element_type_2 = None
        permute: "bf16[768, 3072][1, 768]cpu" = torch.ops.aten.permute.default(convert_element_type_1, [1, 0]); convert_element_type_1 = None
        addmm: "bf16[832, 3072][3072, 1]cpu" = torch.ops.aten.addmm.default(convert_element_type, view, permute); convert_element_type = view = permute = None
        view_1: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.view.default(addmm, [1, 832, 3072]); addmm = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/activations.py:57 in forward, code: return 0.5 * input * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0))))
        mul: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.mul.Tensor(view_1, 0.5)
        pow_1: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.pow.Tensor_Scalar(view_1, 3.0)
        mul_1: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.mul.Tensor(pow_1, 0.044715); pow_1 = None
        add: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.add.Tensor(view_1, mul_1); view_1 = mul_1 = None
        mul_2: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.mul.Tensor(add, 0.7978845608028654); add = None
        tanh: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.tanh.default(mul_2); mul_2 = None
        add_1: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.add.Tensor(tanh, 1.0); tanh = None
        mul_3: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.mul.Tensor(mul, add_1); mul = add_1 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1435 in forward, code: hidden_states = self.dense(hidden_states)
        convert_element_type_6: "bf16[768][1]cpu" = torch.ops.prims.convert_element_type.default(arg3_1, torch.bfloat16); arg3_1 = None
        convert_element_type_7: "bf16[768, 3072][3072, 1]cpu" = torch.ops.prims.convert_element_type.default(arg2_1, torch.bfloat16); arg2_1 = None
        view_2: "bf16[832, 3072][3072, 1]cpu" = torch.ops.aten.view.default(mul_3, [832, 3072]); mul_3 = None
        permute_1: "bf16[3072, 768][1, 3072]cpu" = torch.ops.aten.permute.default(convert_element_type_7, [1, 0]); convert_element_type_7 = None
        addmm_1: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.addmm.default(convert_element_type_6, view_2, permute_1); convert_element_type_6 = view_2 = permute_1 = None
        view_3: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.view.default(addmm_1, [1, 832, 768]); addmm_1 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1436 in forward, code: hidden_states = self.dropout(hidden_states)
        clone: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.clone.default(view_3); view_3 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1437 in forward, code: hidden_states = self.LayerNorm(hidden_states + input_tensor)
        add_2: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(clone, arg6_1); clone = arg6_1 = None
        var_mean = torch.ops.aten.var_mean.correction(add_2, [2], correction = 0, keepdim = True)
        getitem: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[0]
        getitem_1: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[1]; var_mean = None
        add_3: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.add.Tensor(getitem, 1e-12); getitem = None
        rsqrt: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.rsqrt.default(add_3); add_3 = None
        sub: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.sub.Tensor(add_2, getitem_1); add_2 = getitem_1 = None
        mul_4: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(sub, rsqrt); sub = rsqrt = None
        mul_5: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(mul_4, arg4_1); mul_4 = arg4_1 = None
        add_4: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(mul_5, arg5_1); mul_5 = arg5_1 = None
        return (add_4,)
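This graph is the whole BigBird feed-forward block: intermediate Linear (768 -> 3072) in bf16, the GELU chain, output Linear (3072 -> 768), dropout-as-clone, then the residual LayerNorm in fp32. A sketch of the eager computation, with dense1/dense2/ln standing in for the frozen modules:

    import torch.nn.functional as F

    h = F.gelu(dense1(x), approximate='tanh')  # 768 -> 3072, bf16 under autocast
    y = ln(dense2(h) + x)                      # 3072 -> 768, residual add + LayerNorm in f32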
V0627 17:31:10.133000 139845268738432 torch/_inductor/compile_fx.py:754] {"inductor_post_grad_graph": {}, "frame_id": 18, "frame_compile_id": 0, "attempt": 0, "has_payload": "8400618ae53b7968980ef85788f68b83"}
class <lambda>(torch.nn.Module):
    def forward(self, arg6_1: "f32[1, 832, 768][638976, 768, 1]cpu"):
        # No stacktrace found for following nodes
        _frozen_param4: "f32[768][1]cpu" = self._frozen_param4
        _frozen_param5: "f32[768][1]cpu" = self._frozen_param5

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1421 in forward, code: hidden_states = self.dense(hidden_states)
        _frozen_param6: "bf16[3072][1]cpu" = self._frozen_param6

        # No stacktrace found for following nodes
        _frozen_param10: "bf16[3072, 768][1, 0]cpu" = self._frozen_param10

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1435 in forward, code: hidden_states = self.dense(hidden_states)
        _frozen_param8: "bf16[768][1]cpu" = self._frozen_param8

        # No stacktrace found for following nodes
        _frozen_param11: "bf16[768, 3072][1, 0]cpu" = self._frozen_param11

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1421 in forward, code: hidden_states = self.dense(hidden_states)
        convert_element_type_2: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg6_1, torch.bfloat16)
        _linear_pointwise_default_3: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.mkldnn._linear_pointwise.default(convert_element_type_2, _frozen_param10, _frozen_param6, 'none', [], ''); convert_element_type_2 = _frozen_param10 = _frozen_param6 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/activations.py:57 in forward, code: return 0.5 * input * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0))))
        mul: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.mul.Tensor(_linear_pointwise_default_3, 0.5)
        pow_1: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.pow.Tensor_Scalar(_linear_pointwise_default_3, 3.0)
        mul_1: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.mul.Tensor(pow_1, 0.044715); pow_1 = None
        add: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.add.Tensor(_linear_pointwise_default_3, mul_1); _linear_pointwise_default_3 = mul_1 = None
        mul_2: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.mul.Tensor(add, 0.7978845608028654); add = None
        tanh: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.tanh.default(mul_2); mul_2 = None
        add_1: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.add.Tensor(tanh, 1.0); tanh = None
        mul_3: "bf16[1, 832, 3072][2555904, 3072, 1]cpu" = torch.ops.aten.mul.Tensor(mul, add_1); mul = add_1 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1435 in forward, code: hidden_states = self.dense(hidden_states)
        _linear_pointwise_default_2: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.mkldnn._linear_pointwise.default(mul_3, _frozen_param11, _frozen_param8, 'none', [], ''); mul_3 = _frozen_param11 = _frozen_param8 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1437 in forward, code: hidden_states = self.LayerNorm(hidden_states + input_tensor)
        add_2: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(_linear_pointwise_default_2, arg6_1); _linear_pointwise_default_2 = arg6_1 = None
        var_mean = torch.ops.aten.var_mean.correction(add_2, [2], correction = 0, keepdim = True)
        getitem: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[0]
        getitem_1: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[1]; var_mean = None
        sub: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.sub.Tensor(add_2, getitem_1); add_2 = getitem_1 = None
        add_3: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.add.Tensor(getitem, 1e-12); getitem = None
        rsqrt: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.rsqrt.default(add_3); add_3 = None
        mul_4: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(sub, rsqrt); sub = rsqrt = None
        mul_5: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(mul_4, _frozen_param4); mul_4 = _frozen_param4 = None
        add_4: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(mul_5, _frozen_param5); mul_5 = _frozen_param5 = None
        return (add_4,)
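As in the previous frame, freezing folds both Linears into prepacked bf16 mkldnn._linear_pointwise calls; the elementwise GELU chain left between them is exactly what Inductor fuses into cpp_fused_add_mul_pow_tanh_1 in the file below, where pow(x, 3.0) is also strength-reduced to x*x*x (numerically benign for floats):

    import torch

    x = torch.randn(16)
    assert torch.allclose(x.pow(3.0), x * x * x)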
V0627 17:31:10.240000 139845268738432 torch/_inductor/graph.py:1693] {"inductor_output_code": {"filename": "/tmp/torchinductor_leslie/yq/cyqi5vcdu2onzw25fkzgawphp3sm6xov6rt4wwjoshykrnlnqms3.py"}, "frame_id": 18, "frame_compile_id": 0, "attempt": 0, "has_payload": "220b8ade00d54ed30a9ebc3492a6ee4d"}
# AOT ID: ['11_inference']
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
from torch._inductor.codegen.memory_planning import _align as align
from torch import device, empty_strided
from torch._inductor.async_compile import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
from torch._inductor.codegen.multi_kernel import MultiKernelCall

aten = torch.ops.aten
inductor_ops = torch.ops.inductor
_quantized = torch.ops._quantized
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
alloc_from_pool = torch.ops.inductor._alloc_from_pool
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
async_compile = AsyncCompile()

_frozen_param4 = None   # device(type='cpu') torch.float32 (768,) (1,) 7f2eb1d45300
_frozen_param5 = None   # device(type='cpu') torch.float32 (768,) (1,) 7f2eb1d45350
_frozen_param6 = None   # device(type='cpu') torch.bfloat16 (3072,) (1,) 7f2e301a7600
_frozen_param10 = None  # device(type='cpu') torch.bfloat16 (3072, 768) (1, 0) 7f2e3013c8b0
_frozen_param8 = None   # device(type='cpu') torch.bfloat16 (768,) (1,) 7f2e3013f830
_frozen_param11 = None  # device(type='cpu') torch.bfloat16 (768, 3072) (1, 0) 7f2e3013c2c0

cpp_fused__to_copy_0 = async_compile.cpp_pybinding(['const float*', 'bfloat16*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const float* in_ptr0,
                       bfloat16* out_ptr0)
{
    #pragma omp parallel num_threads(56)
    {
        int tid = omp_get_thread_num();
        {
            #pragma omp for
            for(long x0=static_cast<long>(0L); x0<static_cast<long>(638976L); x0+=static_cast<long>(16L))
            {
                auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<long>(x0), 16);
                auto tmp1 = at::vec::convert<bfloat16>(tmp0);
                tmp1.store(out_ptr0 + static_cast<long>(x0), 16);
            }
        }
    }
}
''')

cpp_fused_add_mul_pow_tanh_1 = async_compile.cpp_pybinding(['bfloat16*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(bfloat16* in_out_ptr0)
{
    #pragma omp parallel num_threads(56)
    {
        int tid = omp_get_thread_num();
        {
            #pragma omp for
            for(long x0=static_cast<long>(0L); x0<static_cast<long>(2555904L); x0+=static_cast<long>(16L))
            {
                auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_out_ptr0 + static_cast<long>(x0), 16);
                auto tmp1 = at::vec::convert<float>(tmp0);
                auto tmp2 = static_cast<float>(0.5);
                auto tmp3 = at::vec::Vectorized<float>(tmp2);
                auto tmp4 = tmp1 * tmp3;
                auto tmp5 = tmp1 * tmp1;
                auto tmp6 = tmp5 * tmp1;
                auto tmp7 = static_cast<float>(0.044715);
                auto tmp8 = at::vec::Vectorized<float>(tmp7);
                auto tmp9 = tmp6 * tmp8;
                auto tmp10 = tmp1 + tmp9;
                auto tmp11 = static_cast<float>(0.7978845608028654);
                auto tmp12 = at::vec::Vectorized<float>(tmp11);
                auto tmp13 = tmp10 * tmp12;
                auto tmp14 = decltype(tmp13)(2) / (decltype(tmp13)(1) + (decltype(tmp13)(-2) * tmp13).exp()) - decltype(tmp13)(1);
                auto tmp15 = static_cast<float>(1.0);
                auto tmp16 = at::vec::Vectorized<float>(tmp15);
                auto tmp17 = tmp14 + tmp16;
                auto tmp18 = tmp4 * tmp17;
                auto tmp19 = at::vec::convert<bfloat16>(tmp18);
                tmp19.store(in_out_ptr0 + static_cast<long>(x0), 16);
            }
        }
    }
}
''')
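# The kernel above fuses the whole GELU chain in place on the first linear's
# output (note the single in_out_ptr0 argument): values are widened
# bf16 -> f32, pow(x, 3.0) appears as x*x*x, and tanh(y) is computed with the
# exp-based identity tanh(y) = 2/(1 + exp(-2*y)) - 1 before the result is
# narrowed back to bf16 for the in-place store.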
cpp_fused_add_native_layer_norm_2 = async_compile.cpp_pybinding(['const bfloat16*', 'const float*', 'const float*', 'const float*', 'float*', 'float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const bfloat16* in_ptr0,
                       const float* in_ptr1,
                       const float* in_ptr2,
                       const float* in_ptr3,
                       float* out_ptr0,
                       float* out_ptr1,
                       float* out_ptr2)
{
    #pragma omp parallel num_threads(56)
    {
        int tid = omp_get_thread_num();
        {
            #pragma omp for
            for(long x0=static_cast<long>(0L); x0<static_cast<long>(832L); x0+=static_cast<long>(1L))
            {
                {
                    Welford<float> tmp_acc0 = Welford<float>();
                    Welford<at::vec::Vectorized<float>> tmp_acc0_vec = Welford<at::vec::Vectorized<float>>();
                    static WeightRecp<at::vec::Vectorized<float>> weight_recps(static_cast<long>(48L));
                    for(long x1=static_cast<long>(0L); x1<static_cast<long>(768L); x1+=static_cast<long>(16L))
                    {
                        auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x1 + (768L*x0)), 16);
                        auto tmp2 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(x1 + (768L*x0)), 16);
                        auto tmp1 = at::vec::convert<float>(tmp0);
                        auto tmp3 = tmp1 + tmp2;
                        tmp_acc0_vec = welford_combine(tmp_acc0_vec, tmp3, &weight_recps);
                    }
                    tmp_acc0 = welford_combine(tmp_acc0, welford_vec_reduce_all(tmp_acc0_vec));
                    out_ptr0[static_cast<long>(x0)] = static_cast<float>(tmp_acc0.mean);
                    out_ptr1[static_cast<long>(x0)] = static_cast<float>(tmp_acc0.m2);
                }
                for(long x1=static_cast<long>(0L); x1<static_cast<long>(768L); x1+=static_cast<long>(16L))
                {
                    auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x1 + (768L*x0)), 16);
                    auto tmp2 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(x1 + (768L*x0)), 16);
                    auto tmp4 = out_ptr0[static_cast<long>(x0)];
                    auto tmp7 = out_ptr1[static_cast<long>(x0)];
                    auto tmp15 = at::vec::Vectorized<float>::loadu(in_ptr2 + static_cast<long>(x1), 16);
                    auto tmp17 = at::vec::Vectorized<float>::loadu(in_ptr3 + static_cast<long>(x1), 16);
                    auto tmp1 = at::vec::convert<float>(tmp0);
                    auto tmp3 = tmp1 + tmp2;
                    auto tmp5 = at::vec::Vectorized<float>(tmp4);
                    auto tmp6 = tmp3 - tmp5;
                    auto tmp8 = static_cast<float>(768.0);
                    auto tmp9 = tmp7 / tmp8;
                    auto tmp10 = static_cast<float>(1e-12);
                    auto tmp11 = decltype(tmp9)(tmp9 + tmp10);
                    auto tmp12 = 1 / std::sqrt(tmp11);
                    auto tmp13 = at::vec::Vectorized<float>(tmp12);
                    auto tmp14 = tmp6 * tmp13;
                    auto tmp16 = tmp14 * tmp15;
                    auto tmp18 = tmp16 + tmp17;
                    tmp18.store(out_ptr2 + static_cast<long>(x1 + (768L*x0)));
                }
            }
        }
    }
}
''')

async_compile.wait(globals())
del async_compile

def call(args):
    arg6_1, = args
    args.clear()
    assert_size_stride(arg6_1, (1, 832, 768), (638976, 768, 1))
    buf0 = empty_strided_cpu((1, 832, 768), (638976, 768, 1), torch.bfloat16)
    cpp_fused__to_copy_0(arg6_1, buf0)
    buf1 = torch.ops.mkldnn._linear_pointwise(buf0, _frozen_param10, _frozen_param6, 'none', [-1], '')
    del buf0
    buf2 = buf1; del buf1  # reuse
    cpp_fused_add_mul_pow_tanh_1(buf2)
    buf3 = torch.ops.mkldnn._linear_pointwise(buf2, _frozen_param11, _frozen_param8, 'none', [-1], '')
    del buf2
    buf4 = empty_strided_cpu((1, 832, 1), (832, 1, 832), torch.float32)
    buf5 = empty_strided_cpu((1, 832, 1), (832, 1, 832), torch.float32)
    buf7 = empty_strided_cpu((1, 832, 768), (638976, 768, 1), torch.float32)
    cpp_fused_add_native_layer_norm_2(buf3, arg6_1, _frozen_param4, _frozen_param5, buf4, buf5, buf7)
    del arg6_1
    return (buf7, )

def benchmark_compiled_module(times=10, repeat=10):
    from torch._dynamo.testing import rand_strided
    from torch._inductor.utils import print_performance
    global _frozen_param4
    _frozen_param4 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.float32)
    global _frozen_param5
    _frozen_param5 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.float32)
    global _frozen_param6
    _frozen_param6 = rand_strided((3072, ), (1, ), device='cpu', dtype=torch.bfloat16)
    global _frozen_param10
    _frozen_param10 = rand_strided((3072, 768), (1, 0), device='cpu', dtype=torch.bfloat16)
    global _frozen_param8
    _frozen_param8 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.bfloat16)
    global _frozen_param11
    _frozen_param11 = rand_strided((768, 3072), (1, 0), device='cpu', dtype=torch.bfloat16)
    arg6_1 = rand_strided((1, 832, 768), (638976, 768, 1), device='cpu', dtype=torch.float32)
    fn = lambda: call([arg6_1])
    return print_performance(fn, times=times, repeat=repeat)

if __name__ == "__main__":
    from torch._inductor.wrapper_benchmark import compiled_module_main
    compiled_module_main('hf_BigBird', benchmark_compiled_module)
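The wrapper shows Inductor's memory planning at work: "buf2 = buf1; del buf1  # reuse" renames the first linear's 1x832x3072 bf16 output so cpp_fused_add_mul_pow_tanh_1 can mutate it in place, each buffer is deleted at its last use, and arg6_1 is deliberately kept alive until cpp_fused_add_native_layer_norm_2, which still needs it for the residual add.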
V0627 17:31:10.257000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 18, "frame_compile_id": 0, "attempt": 0, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:10.258000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 18, "frame_compile_id": 0, "attempt": 0, "has_payload": "f3efa14ea8c088430fc033af17fce04d"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202276400)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| | | +- GuardManager: source=L['self']._modules, accessed_by=DictGetItemGuardAccessor(_modules)
| | | | +- GuardManager: source=L['self'].output, accessed_by=DictGetItemGuardAccessor(output)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].output, 139839202272320)
| | | | | +- GuardManager: source=L['self'].output.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- DICT_CONTAINS: not ___dict_contains('forward', L['self'].output.__dict__)
| | | | | | +- GuardManager: source=L['self'].output.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.training, 7685824)
| | | | | | +- GuardManager: source=L['self'].output._modules, accessed_by=DictGetItemGuardAccessor(_modules)
| | | | | | | +- GuardManager: source=L['self'].output.dense, accessed_by=DictGetItemGuardAccessor(dense)
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.dense, 139839202267808)
| | | | | | | | +- GuardManager: source=L['self'].output.dense.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | | | | +- GuardManager: source=L['self'].output.dense.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.dense.training, 7685824)
| | | | | | | +- GuardManager: source=L['self'].output.dropout, accessed_by=DictGetItemGuardAccessor(dropout)
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.dropout, 139839202268288)
| | | | | | | | +- GuardManager: source=L['self'].output.dropout.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | | | | +- GuardManager: source=L['self'].output.dropout.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.dropout.training, 7685824)
| | | | | | | +- GuardManager: source=L['self'].output.LayerNorm, accessed_by=DictGetItemGuardAccessor(LayerNorm)
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.LayerNorm, 139839202268912)
| | | | | | | | +- GuardManager: source=L['self'].output.LayerNorm.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | | | | +- GuardManager: source=L['self'].output.LayerNorm.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].output.LayerNorm.training, 7685824)
| | | | | | +- GuardManager: source=L['self'].output._forward_hooks, accessed_by=DictGetItemGuardAccessor(_forward_hooks)
| | | | | | +- GuardManager: source=L['self'].output._backward_hooks, accessed_by=DictGetItemGuardAccessor(_backward_hooks)
| | | | | | +- GuardManager: source=L['self'].output._forward_pre_hooks, accessed_by=DictGetItemGuardAccessor(_forward_pre_hooks)
| | | | | | +- GuardManager: source=L['self'].output._backward_pre_hooks, accessed_by=DictGetItemGuardAccessor(_backward_pre_hooks)
| | | | +- GuardManager: source=L['self'].intermediate, accessed_by=DictGetItemGuardAccessor(intermediate)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].intermediate, 139839202275440)
| | | | | +- GuardManager: source=L['self'].intermediate.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- DICT_CONTAINS: not ___dict_contains('forward', L['self'].intermediate.__dict__)
| | | | | | +- GuardManager: source=L['self'].intermediate.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].intermediate.training, 7685824)
| | | | | | +- GuardManager: source=L['self'].intermediate._modules, accessed_by=DictGetItemGuardAccessor(_modules)
| | | | | | | +- GuardManager: source=L['self'].intermediate.dense, accessed_by=DictGetItemGuardAccessor(dense)
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].intermediate.dense, 139839202270544)
| | | | | | | | +- GuardManager: source=L['self'].intermediate.dense.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | | | | +- GuardManager: source=L['self'].intermediate.dense.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].intermediate.dense.training, 7685824)
| | | | | | | +- GuardManager: source=L['self'].intermediate.intermediate_act_fn, accessed_by=DictGetItemGuardAccessor(intermediate_act_fn)
| | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].intermediate.intermediate_act_fn, 139839202267616)
| | | | | | | | +- GuardManager: source=L['self'].intermediate.intermediate_act_fn.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | | | | +- DICT_CONTAINS: not ___dict_contains('forward', L['self'].intermediate.intermediate_act_fn.__dict__)
| | | | | | | | | +- GuardManager: source=L['self'].intermediate.intermediate_act_fn.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].intermediate.intermediate_act_fn.training, 7685824)
| | | | | | | | | +- GuardManager: source=L['self'].intermediate.intermediate_act_fn._forward_hooks, accessed_by=DictGetItemGuardAccessor(_forward_hooks)
| | | | | | | | | +- GuardManager: source=L['self'].intermediate.intermediate_act_fn._backward_hooks, accessed_by=DictGetItemGuardAccessor(_backward_hooks)
| | | | | | | | | +- GuardManager: source=L['self'].intermediate.intermediate_act_fn._forward_pre_hooks, accessed_by=DictGetItemGuardAccessor(_forward_pre_hooks)
| | | | | | | | | +- GuardManager: source=L['self'].intermediate.intermediate_act_fn._backward_pre_hooks, accessed_by=DictGetItemGuardAccessor(_backward_pre_hooks)
| | | | | | +- GuardManager: source=L['self'].intermediate._forward_hooks, accessed_by=DictGetItemGuardAccessor(_forward_hooks)
| | | | | | +- GuardManager: source=L['self'].intermediate._backward_hooks, accessed_by=DictGetItemGuardAccessor(_backward_hooks)
| | | | | | +- GuardManager: source=L['self'].intermediate._forward_pre_hooks, accessed_by=DictGetItemGuardAccessor(_forward_pre_hooks)
| | | | | | +- GuardManager: source=L['self'].intermediate._backward_pre_hooks, accessed_by=DictGetItemGuardAccessor(_backward_pre_hooks)
| | | +- GuardManager: source=L['self'].is_decoder, accessed_by=DictGetItemGuardAccessor(is_decoder)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].is_decoder, 7685824)
| | | +- GuardManager: source=L['self'].seq_len_dim, accessed_by=DictGetItemGuardAccessor(seq_len_dim)
| | | | +- EQUALS_MATCH: L['self'].seq_len_dim == 1
| | | +- GuardManager: source=L['self'].chunk_size_feed_forward, accessed_by=DictGetItemGuardAccessor(chunk_size_feed_forward)
| | | | +- EQUALS_MATCH: L['self'].chunk_size_feed_forward == 0
| +- GuardManager: source=L['___stack0'], accessed_by=DictGetItemGuardAccessor(___stack0)
| | +- TYPE_MATCH: ___check_type_id(L['___stack0'], 7625984)
| | +- LENGTH_CHECK: len(L['___stack0']) == 1
| | +- GuardManager: source=L['___stack0'][0], accessed_by=TupleGetItemGuardAccessor(0)
| | | +- TENSOR_MATCH: check_tensor(L['___stack0'][0], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 832, 768], stride=[638976, 768, 1])
| | | +- NO_HASATTR: hasattr(L['___stack0'][0], '_dynamo_dynamic_indices') == False
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['apply_chunking_to_forward'], accessed_by=DictGetItemGuardAccessor(apply_chunking_to_forward)
| | | +- GuardManager: source=G['apply_chunking_to_forward'].__code__, accessed_by=GetAttrGuardAccessor(__code__)
| | | | +- ID_MATCH: ___check_obj_id(G['apply_chunking_to_forward'].__code__, 139839646455872)
| | +- GuardManager: source=G['__builtins_dict___52'], accessed_by=DictGetItemGuardAccessor(__builtins_dict___52)
| | | +- GuardManager: source=G['__builtins_dict___52']['len'], accessed_by=DictGetItemGuardAccessor(len)
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___52']['len'], 139845257826832)
| | +- GuardManager: source=G['__import_transformers_dot_activations'], accessed_by=DictGetItemGuardAccessor(__import_transformers_dot_activations)
| | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_activations'], 139839665031744)
| | | +- GuardManager: source=G['__import_transformers_dot_activations'].math, accessed_by=GetAttrGuardAccessor(math)
| | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_activations'].math, 139845236089744)
| | | | +- GuardManager: source=G['__import_transformers_dot_activations'].math.pi, accessed_by=GetAttrGuardAccessor(pi)
| | | | | +- EQUALS_MATCH: G['__import_transformers_dot_activations'].math.pi == 3.141592653589793
| | | | +- GuardManager: source=G['__import_transformers_dot_activations'].math.sqrt, accessed_by=GetAttrGuardAccessor(sqrt)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_activations'].math.sqrt, 139845236093344)
| | | +- GuardManager: source=G['__import_transformers_dot_activations'].torch, accessed_by=GetAttrGuardAccessor(torch)
| | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_activations'].torch, 139845236322800)
| | | | +- GuardManager: source=G['__import_transformers_dot_activations'].torch.pow, accessed_by=GetAttrGuardAccessor(pow)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_activations'].torch.pow, 139845228824512)
| | | | +- GuardManager: source=G['__import_transformers_dot_activations'].torch.tanh, accessed_by=GetAttrGuardAccessor(tanh)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_activations'].torch.tanh, 139845228799744)
| | +- GuardManager: source=G['__import_transformers_dot_pytorch_utils'], accessed_by=DictGetItemGuardAccessor(__import_transformers_dot_pytorch_utils)
| | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_pytorch_utils'], 139839703287984)
| | | +- GuardManager: source=G['__import_transformers_dot_pytorch_utils'].inspect, accessed_by=GetAttrGuardAccessor(inspect)
| | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_pytorch_utils'].inspect, 139845236517488)
| | | | +- GuardManager: source=G['__import_transformers_dot_pytorch_utils'].inspect.signature, accessed_by=GetAttrGuardAccessor(signature)
| | | | | +- GuardManager: source=G['__import_transformers_dot_pytorch_utils'].inspect.signature.__code__, accessed_by=GetAttrGuardAccessor(__code__)
| | | | | | +- ID_MATCH: ___check_obj_id(G['__import_transformers_dot_pytorch_utils'].inspect.signature.__code__, 139845231798640)
| | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'], accessed_by=DictGetItemGuardAccessor(__import_torch_dot_nn_dot_modules_dot_module)
| | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'], 139842442598640)
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'].torch, accessed_by=GetAttrGuardAccessor(torch)
| | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'].torch, 139845236322800)
| | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C, accessed_by=GetAttrGuardAccessor(_C)
| | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C, 139845228547104)
| | | | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C._get_tracing_state, accessed_by=GetAttrGuardAccessor(_get_tracing_state)
| | | | | | +- ID_MATCH: ___check_obj_id(G['__import_torch_dot_nn_dot_modules_dot_module'].torch._C._get_tracing_state, 139842451895088)
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_hooks, accessed_by=GetAttrGuardAccessor(_global_forward_hooks)
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_hooks, 7497696)
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_hooks
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_hooks, accessed_by=GetAttrGuardAccessor(_global_backward_hooks)
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_hooks, 7497696)
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_hooks
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_pre_hooks, accessed_by=GetAttrGuardAccessor(_global_forward_pre_hooks)
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_pre_hooks, 7497696)
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_forward_pre_hooks
| | | +- GuardManager: source=G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_pre_hooks, accessed_by=GetAttrGuardAccessor(_global_backward_pre_hooks)
| | | | +- TYPE_MATCH: ___check_type_id(G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_pre_hooks, 7497696)
| | | | +- DICT_LENGTH: not G['__import_torch_dot_nn_dot_modules_dot_module']._global_backward_pre_hooks
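The guard tree above is what makes this compiled frame reusable: on each later call the C++ guard manager walks the tree and triggers recompilation if any check fails. A conceptual Python sketch of the three most common leaf checks (helper names here are hypothetical approximations of the builtins in the dump):

def check_obj_id(obj, expected):    # ID_MATCH: same Python object as at compile time
    return id(obj) == expected

def check_type_id(obj, expected):   # TYPE_MATCH: same concrete type object
    return id(type(obj)) == expected

def equals_match(value, expected):  # EQUALS_MATCH: structural equality
    return value == expected

# e.g. the guard on L['self'].seq_len_dim above is equals_match(self.seq_len_dim, 1)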
V0627 17:31:10.259000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "18/0", "frame_key": "23", "co_name": "torch_dynamo_resume_in_forward_at_1488", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 1488, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 69, "shape_env_guard_count": 0, "graph_op_count": 13, "graph_node_count": 15, "graph_input_count": 1, "start_time": 1719534669.9335542, "entire_frame_compile_time_s": 0.3254525661468506, "backend_compile_time_s": 0.26067519187927246, "inductor_compile_time_s": 0.1273505687713623, "code_gen_time_s": 0.07860469818115234, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": [], "dynamo_time_before_restart_s": 0.0, "has_guarded_code": true}, "frame_id": 18, "frame_compile_id": 0, "attempt": 0}
V0627 17:31:10.262000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.270000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 46, "size": 442368}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.270000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 46}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.270000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 46, "id": 0, "source": "L['band_mask']"}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.271000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 46, "size": 2555904}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.271000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e301592b0>", "describer_id": 46}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.271000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 46, "id": 1, "source": "L['hidden_states']"}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.273000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 46, "size": 3328}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.273000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 46}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.273000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 2, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 46}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.273000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 46, "id": 2, "source": "L['from_mask']"}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.275000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 4, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 46}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.275000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 46, "id": 4, "source": "L['to_mask']"}, "frame_id": 7, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.306000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 47, "size": 2555904}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.307000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e301592b0>", "describer_id": 47}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.307000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 47, "id": 0, "source": "L['hidden_states']"}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.308000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 47, "size": 442368}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.308000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 47}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.308000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 47, "id": 1, "source": "L['band_mask']"}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.309000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 47, "size": 3328}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.309000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 47}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.309000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 2, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 47}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.309000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 47, "id": 2, "source": "L['from_mask']"}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.310000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 4, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 47}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.311000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 47, "id": 4, "source": "L['to_mask']"}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.312000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 5, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 13, 64], "is_leaf": true, "is_view": true, "stride": [832, 64, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317fbec0>", "describer_id": 47}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.312000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 47, "id": 5, "source": "L['blocked_encoder_mask']"}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.317000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:10.317000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1, "has_payload": "9d228664307649151c1145ad228290a7"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202274768)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| | | +- GuardManager: source=L['self']._modules, accessed_by=DictGetItemGuardAccessor(_modules)
| | | | +- GuardManager: source=L['self'].attention, accessed_by=DictGetItemGuardAccessor(attention)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].attention, 139839202265168)
| | | | | +- GuardManager: source=L['self'].attention.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- GuardManager: source=L['self'].attention.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].attention.training, 7685824)
| +- GuardManager: source=L['to_mask'], accessed_by=DictGetItemGuardAccessor(to_mask)
| | +- TENSOR_MATCH: check_tensor(L['to_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 1, 832], stride=[832, 832, 832, 1])
| | +- NO_HASATTR: hasattr(L['to_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['blocked_encoder_mask'])
| +- GuardManager: source=L['band_mask'], accessed_by=DictGetItemGuardAccessor(band_mask)
| | +- TENSOR_MATCH: check_tensor(L['band_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 9, 64, 192], stride=[110592, 110592, 12288, 192, 1])
| | +- NO_HASATTR: hasattr(L['band_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['blocked_encoder_mask'])
| +- GuardManager: source=L['from_mask'], accessed_by=DictGetItemGuardAccessor(from_mask)
| | +- TENSOR_MATCH: check_tensor(L['from_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 832, 1], stride=[832, 832, 1, 1])
| | +- NO_HASATTR: hasattr(L['from_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['blocked_encoder_mask'])
| +- GuardManager: source=L['head_mask'], accessed_by=DictGetItemGuardAccessor(head_mask)
| | +- ID_MATCH: ___check_obj_id(L['head_mask'], 7636800)
| +- GuardManager: source=L['hidden_states'], accessed_by=DictGetItemGuardAccessor(hidden_states)
| | +- TENSOR_MATCH: check_tensor(L['hidden_states'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 832, 768], stride=[638976, 768, 1])
| | +- NO_HASATTR: hasattr(L['hidden_states'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['blocked_encoder_mask'])
| +- GuardManager: source=L['attention_mask'], accessed_by=DictGetItemGuardAccessor(attention_mask)
| | +- ID_MATCH: ___check_obj_id(L['attention_mask'], 7636800)
| +- GuardManager: source=L['past_key_value'], accessed_by=DictGetItemGuardAccessor(past_key_value)
| | +- ID_MATCH: ___check_obj_id(L['past_key_value'], 7636800)
| +- GuardManager: source=L['output_attentions'], accessed_by=DictGetItemGuardAccessor(output_attentions)
| | +- ID_MATCH: ___check_obj_id(L['output_attentions'], 7685824)
| +- GuardManager: source=L['blocked_encoder_mask'], accessed_by=DictGetItemGuardAccessor(blocked_encoder_mask)
| | +- TENSOR_MATCH: check_tensor(L['blocked_encoder_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 13, 64], stride=[832, 64, 1])
| | +- NO_HASATTR: hasattr(L['blocked_encoder_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['blocked_encoder_mask'])
| +- GuardManager: source=L['encoder_hidden_states'], accessed_by=DictGetItemGuardAccessor(encoder_hidden_states)
| | +- ID_MATCH: ___check_obj_id(L['encoder_hidden_states'], 7636800)
| +- GuardManager: source=L['encoder_attention_mask'], accessed_by=DictGetItemGuardAccessor(encoder_attention_mask)
| | +- ID_MATCH: ___check_obj_id(L['encoder_attention_mask'], 7636800)
V0627 17:31:10.318000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "7/1", "frame_key": "24", "co_name": "forward", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 1472, "cache_size": 0, "accumulated_cache_size": 1, "guard_count": 18, "shape_env_guard_count": 0, "graph_op_count": 0, "graph_node_count": 5, "graph_input_count": 5, "start_time": 1719534670.2629929, "entire_frame_compile_time_s": 0.05506253242492676, "backend_compile_time_s": null, "inductor_compile_time_s": null, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Graph break due to unsupported builtin numpy.random.mtrand.seed. This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind). If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround. If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/docs/main/notes/custom_operators.html for more details) or, if it is traceable, use torch.compiler.allow_in_graph."], "dynamo_time_before_restart_s": 0.04132270812988281, "has_guarded_code": true}, "frame_id": 7, "frame_compile_id": 1, "attempt": 1}
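Note the "restart_reasons" entry in the metrics line above: tracing hit numpy.random.mtrand.seed, a C-level builtin Dynamo cannot trace, so the frame was restarted with a graph break. A hedged repro sketch of that failure mode (not taken from this run; assumes torch._dynamo.explain's call-through interface):

import numpy as np
import torch
import torch._dynamo

def fn(x):
    np.random.seed(0)   # untraceable C builtin -> graph break
    return torch.sin(x)

# explain() reports graph-break counts and reasons for a toy input.
print(torch._dynamo.explain(fn)(torch.randn(4)))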
V0627 17:31:10.318000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.320000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 48, "size": 442368}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.320000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 48}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.320000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 48, "id": 0, "source": "L['band_mask']"}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.321000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 48, "size": 2555904}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.321000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e301592b0>", "describer_id": 48}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.321000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 48, "id": 1, "source": "L['hidden_states']"}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.323000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 48, "size": 3328}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.323000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 48}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.323000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 2, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 48}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.323000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 48, "id": 2, "source": "L['from_mask']"}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.325000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 4, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 48}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.325000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 48, "id": 4, "source": "L['to_mask']"}, "frame_id": 8, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.355000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 49, "size": 442368}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.355000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 49}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.355000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 49, "id": 0, "source": "L['band_mask']"}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.356000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 49, "size": 2555904}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.356000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e301592b0>", "describer_id": 49}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.356000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 49, "id": 1, "source": "L['hidden_states']"}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.357000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 49, "size": 3328}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.357000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 49}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.358000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 2, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 49}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.358000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 49, "id": 2, "source": "L['from_mask']"}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.360000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 4, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 49}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.360000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 49, "id": 4, "source": "L['to_mask']"}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.362000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 5, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 13, 64], "is_leaf": true, "is_view": true, "stride": [832, 64, 1], "storage": 2, "base": 3, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317fbec0>", "describer_id": 49}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.362000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 49, "id": 5, "source": "L['from_blocked_mask']"}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.365000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {"l_band_mask_": [1, 1, 9, 64, 192], "l_from_mask_": [1, 1, 832, 1], "l_to_mask_": [1, 1, 1, 832], "band_mask": [1, 1, 9, 64, 192], "from_mask": [1, 1, 832, 1], "to_mask": [1, 1, 1, 832]}}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1, "has_payload": "b5eca5f100188f494b5033015854eb4e"}
class GraphModule(torch.nn.Module):
def forward(self, L_band_mask_: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu", L_from_mask_: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu", L_to_mask_: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu"):
l_band_mask_ = L_band_mask_
l_from_mask_ = L_from_mask_
l_to_mask_ = L_to_mask_
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1383 in forward, code: band_mask = band_mask.to(hidden_states.dtype)
band_mask: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = l_band_mask_.to(torch.float32); l_band_mask_ = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1385 in forward, code: from_mask = from_mask.to(hidden_states.dtype)
from_mask: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu" = l_from_mask_.to(torch.float32); l_from_mask_ = None
# File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1387 in forward, code: to_mask = to_mask.to(hidden_states.dtype)
to_mask: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = l_to_mask_.to(torch.float32); l_to_mask_ = None
return (band_mask, from_mask, to_mask)
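Everything this frame captured is the three dtype normalizations at the top of the BigBird encoder forward. In eager terms (a sketch; shapes taken from the describe_tensor entries above):

import torch
hidden_states = torch.randn(1, 832, 768)   # float32
band_mask = torch.randn(1, 1, 9, 64, 192)
from_mask = torch.randn(1, 1, 832, 1)
to_mask = torch.randn(1, 1, 1, 832)
# band_mask = band_mask.to(hidden_states.dtype), etc., per modeling_big_bird.py:1383-1387
band_mask, from_mask, to_mask = (t.to(hidden_states.dtype) for t in (band_mask, from_mask, to_mask))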
V0627 17:31:10.380000 139845268738432 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:199] {"aot_forward_graph": {}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1, "has_payload": "3ad949bb5c0c76a73cad2e99f5c9ebe2"}
class <lambda>(torch.nn.Module):
def forward(self, arg0_1: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu", arg1_1: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu", arg2_1: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu"):
return (arg0_1, arg1_1, arg2_1)
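The functionalized forward graph is a pure passthrough: every input is already float32, so the `.to(torch.float32)` casts are no-ops and are eliminated before lowering. The eager behavior this mirrors:

import torch
x = torch.randn(2, dtype=torch.float32)
assert x.to(torch.float32) is x   # a same-dtype cast returns the tensor itself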
V0627 17:31:10.392000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:10.392000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1, "has_payload": "78c6200e495d09cd995b82c1e530d62e"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202265168)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| | | +- GuardManager: source=L['self']._modules, accessed_by=DictGetItemGuardAccessor(_modules)
| | | | +- GuardManager: source=L['self'].self, accessed_by=DictGetItemGuardAccessor(self)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].self, 139839202264976)
| | | | | +- GuardManager: source=L['self'].self.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- GuardManager: source=L['self'].self.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].self.training, 7685824)
| | | +- GuardManager: source=L['self'].attention_type, accessed_by=DictGetItemGuardAccessor(attention_type)
| | | | +- EQUALS_MATCH: L['self'].attention_type == 'block_sparse'
| +- GuardManager: source=L['to_mask'], accessed_by=DictGetItemGuardAccessor(to_mask)
| | +- TENSOR_MATCH: check_tensor(L['to_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 1, 832], stride=[832, 832, 832, 1])
| | +- NO_HASATTR: hasattr(L['to_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['band_mask'], accessed_by=DictGetItemGuardAccessor(band_mask)
| | +- TENSOR_MATCH: check_tensor(L['band_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 9, 64, 192], stride=[110592, 110592, 12288, 192, 1])
| | +- NO_HASATTR: hasattr(L['band_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['from_mask'], accessed_by=DictGetItemGuardAccessor(from_mask)
| | +- TENSOR_MATCH: check_tensor(L['from_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 832, 1], stride=[832, 832, 1, 1])
| | +- NO_HASATTR: hasattr(L['from_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['hidden_states'], accessed_by=DictGetItemGuardAccessor(hidden_states)
| | +- TENSOR_MATCH: check_tensor(L['hidden_states'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 832, 768], stride=[638976, 768, 1])
| | +- NO_HASATTR: hasattr(L['hidden_states'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['from_blocked_mask'], accessed_by=DictGetItemGuardAccessor(from_blocked_mask)
| | +- TENSOR_MATCH: check_tensor(L['from_blocked_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 13, 64], stride=[832, 64, 1])
| | +- NO_HASATTR: hasattr(L['from_blocked_mask'], '_dynamo_dynamic_indices') == False
| | +- TENSOR_ALIASING: L['from_blocked_mask'] is L['to_blocked_mask']
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['output_attentions'], accessed_by=DictGetItemGuardAccessor(output_attentions)
| | +- ID_MATCH: ___check_obj_id(L['output_attentions'], 7685824)
| +- GuardManager: source=L['encoder_hidden_states'], accessed_by=DictGetItemGuardAccessor(encoder_hidden_states)
| | +- ID_MATCH: ___check_obj_id(L['encoder_hidden_states'], 7636800)
| +- GuardManager: source=L['to_blocked_mask'], accessed_by=DictGetItemGuardAccessor(to_blocked_mask)
| | +- TENSOR_ALIASING: L['from_blocked_mask'] is L['to_blocked_mask']
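Two aliasing guard flavors appear in this tree: TENSOR_ALIASING requires two argument names to be the very same tensor object (from_blocked_mask/to_blocked_mask above), while NO_TENSOR_ALIASING requires a group of names to be pairwise distinct objects; from_mask and to_mask are distinct views of one storage (see the describe_tensor entries earlier) and still satisfy it. A conceptual sketch with hypothetical helper names, not the real C++ checks:

def tensor_aliasing(a, b):        # TENSOR_ALIASING: same object required
    return a is b

def check_no_aliasing(*tensors):  # NO_TENSOR_ALIASING: pairwise distinct objects;
    return len(tensors) == len({id(t) for t in tensors})  # shared storage still passes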
V0627 17:31:10.392000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "8/1", "frame_key": "25", "co_name": "forward", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 1365, "cache_size": 0, "accumulated_cache_size": 1, "guard_count": 16, "shape_env_guard_count": 0, "graph_op_count": 3, "graph_node_count": 7, "graph_input_count": 3, "start_time": 1719534670.3189635, "entire_frame_compile_time_s": 0.07366013526916504, "backend_compile_time_s": 0.02211451530456543, "inductor_compile_time_s": 0.00025773048400878906, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Graph break due to unsupported builtin numpy.random.mtrand.seed. This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind). If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround. If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/docs/main/notes/custom_operators.html for more details) or, if it is traceable, use torch.compiler.allow_in_graph."], "dynamo_time_before_restart_s": 0.03475379943847656, "has_guarded_code": true}, "frame_id": 8, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.393000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 9, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.395000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 51, "size": 2555904}, "frame_id": 9, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.395000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e301592b0>", "describer_id": 51}, "frame_id": 9, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.395000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 51, "id": 0, "source": "L['hidden_states']"}, "frame_id": 9, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.419000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 52, "size": 2555904}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.419000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832, 768], "is_leaf": true, "stride": [638976, 768, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e301592b0>", "describer_id": 52}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.419000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 52, "id": 0, "source": "L['hidden_states']"}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.434000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 7, "describer_id": 52, "size": 442368}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.434000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 7, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 7, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 52}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.434000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 52, "id": 7, "source": "L['band_mask']"}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.435000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 8, "describer_id": 52, "size": 3328}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.435000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 9, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 8, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 52}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.435000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 8, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 8, "base": 9, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 52}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.435000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 52, "id": 8, "source": "L['from_mask']"}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.437000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 10, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 8, "base": 9, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 52}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.437000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 52, "id": 10, "source": "L['to_mask']"}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.438000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 11, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 13, 64], "is_leaf": true, "is_view": true, "stride": [832, 64, 1], "storage": 8, "base": 9, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317fbec0>", "describer_id": 52}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.438000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 52, "id": 11, "source": "L['from_blocked_mask']"}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.441000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {"l_hidden_states_": [1, 832, 768], "l__self___query": [1, 832, 768], "x": [1, 832, 12, 64], "query_layer": [1, 12, 832, 64], "l__self___key": [1, 832, 768], "x_1": [1, 832, 12, 64], "key_layer": [1, 12, 832, 64], "l__self___value": [1, 832, 768], "x_2": [1, 832, 12, 64], "value_layer": [1, 12, 832, 64]}}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1, "has_payload": "0aeb326082c3dce6efb144a844a46bdf"}
class GraphModule(torch.nn.Module):
    def forward(self, L_hidden_states_: "f32[1, 832, 768][638976, 768, 1]cpu"):
        l_hidden_states_ = L_hidden_states_

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:468 in forward, code: query_layer = self.transpose_for_scores(self.query(hidden_states))
        l__self___query: "bf16[1, 832, 768][638976, 768, 1]cpu" = self.L__self___query(l_hidden_states_)

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
        x: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = l__self___query.view(1, 832, 12, 64); l__self___query = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
        query_layer: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = x.permute(0, 2, 1, 3); x = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:469 in forward, code: key_layer = self.transpose_for_scores(self.key(hidden_states))
        l__self___key: "bf16[1, 832, 768][638976, 768, 1]cpu" = self.L__self___key(l_hidden_states_)

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
        x_1: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = l__self___key.view(1, 832, 12, 64); l__self___key = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
        key_layer: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = x_1.permute(0, 2, 1, 3); x_1 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:470 in forward, code: value_layer = self.transpose_for_scores(self.value(hidden_states))
        l__self___value: "bf16[1, 832, 768][638976, 768, 1]cpu" = self.L__self___value(l_hidden_states_); l_hidden_states_ = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
        x_2: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = l__self___value.view(1, 832, 12, 64); l__self___value = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
        value_layer: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = x_2.permute(0, 2, 1, 3); x_2 = None
        return (query_layer, key_layer, value_layer)
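The graph above records BigBird's transpose_for_scores as a metadata-only view followed by a permute. A minimal eager-mode sketch of the same reshaping, using the shapes and strides from the trace (illustration only, not code from this log):

    import torch

    hidden = torch.randn(1, 832, 768)   # [batch, seq, hidden], contiguous
    x = hidden.view(1, 832, 12, 64)     # split hidden dim into 12 heads x 64
    layer = x.permute(0, 2, 1, 3)       # [1, 12, 832, 64], no data movement
    assert layer.stride() == (638976, 64, 768, 1)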
V0627 17:31:10.491000 139845268738432 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:199] {"aot_forward_graph": {}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1, "has_payload": "d91c82abb65a6c80f96ee652ae86f63d"}
class <lambda>(torch.nn.Module):
    def forward(self, arg0_1: "f32[768, 768][768, 1]cpu", arg1_1: "f32[768][1]cpu", arg2_1: "f32[768, 768][768, 1]cpu", arg3_1: "f32[768][1]cpu", arg4_1: "f32[768, 768][768, 1]cpu", arg5_1: "f32[768][1]cpu", arg6_1: "f32[1, 832, 768][638976, 768, 1]cpu"):
        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:468 in forward, code: query_layer = self.transpose_for_scores(self.query(hidden_states))
        convert_element_type: "bf16[768][1]cpu" = torch.ops.prims.convert_element_type.default(arg1_1, torch.bfloat16); arg1_1 = None
        convert_element_type_1: "bf16[768, 768][768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg0_1, torch.bfloat16); arg0_1 = None
        convert_element_type_2: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg6_1, torch.bfloat16)
        view: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.view.default(convert_element_type_2, [832, 768]); convert_element_type_2 = None
        permute: "bf16[768, 768][1, 768]cpu" = torch.ops.aten.permute.default(convert_element_type_1, [1, 0]); convert_element_type_1 = None
        addmm: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.addmm.default(convert_element_type, view, permute); convert_element_type = view = permute = None
        view_1: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.view.default(addmm, [1, 832, 768]); addmm = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
        view_2: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.view.default(view_1, [1, 832, 12, 64]); view_1 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
        permute_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_2, [0, 2, 1, 3]); view_2 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:469 in forward, code: key_layer = self.transpose_for_scores(self.key(hidden_states))
        convert_element_type_6: "bf16[768][1]cpu" = torch.ops.prims.convert_element_type.default(arg3_1, torch.bfloat16); arg3_1 = None
        convert_element_type_7: "bf16[768, 768][768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg2_1, torch.bfloat16); arg2_1 = None
        convert_element_type_8: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg6_1, torch.bfloat16)
        view_3: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.view.default(convert_element_type_8, [832, 768]); convert_element_type_8 = None
        permute_2: "bf16[768, 768][1, 768]cpu" = torch.ops.aten.permute.default(convert_element_type_7, [1, 0]); convert_element_type_7 = None
        addmm_1: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.addmm.default(convert_element_type_6, view_3, permute_2); convert_element_type_6 = view_3 = permute_2 = None
        view_4: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.view.default(addmm_1, [1, 832, 768]); addmm_1 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
        view_5: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.view.default(view_4, [1, 832, 12, 64]); view_4 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
        permute_3: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_5, [0, 2, 1, 3]); view_5 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:470 in forward, code: value_layer = self.transpose_for_scores(self.value(hidden_states))
        convert_element_type_12: "bf16[768][1]cpu" = torch.ops.prims.convert_element_type.default(arg5_1, torch.bfloat16); arg5_1 = None
        convert_element_type_13: "bf16[768, 768][768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg4_1, torch.bfloat16); arg4_1 = None
        convert_element_type_14: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg6_1, torch.bfloat16); arg6_1 = None
        view_6: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.view.default(convert_element_type_14, [832, 768]); convert_element_type_14 = None
        permute_4: "bf16[768, 768][1, 768]cpu" = torch.ops.aten.permute.default(convert_element_type_13, [1, 0]); convert_element_type_13 = None
        addmm_2: "bf16[832, 768][768, 1]cpu" = torch.ops.aten.addmm.default(convert_element_type_12, view_6, permute_4); convert_element_type_12 = view_6 = permute_4 = None
        view_7: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.view.default(addmm_2, [1, 832, 768]); addmm_2 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
        view_8: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.view.default(view_7, [1, 832, 12, 64]); view_7 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
        permute_5: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_8, [0, 2, 1, 3]); view_8 = None
        return (permute_1, permute_3, permute_5)
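The convert_element_type casts to bf16 around each linear come from autocast rather than from the model itself (note the AutocastCPU key in the tensor guards further below). A minimal sketch of reproducing that dtype behavior, assuming the benchmark runs the forward pass under CPU autocast:

    import torch

    lin = torch.nn.Linear(768, 768)  # fp32 weight/bias, like arg0_1/arg1_1 above
    x = torch.randn(1, 832, 768)
    with torch.no_grad(), torch.autocast(device_type="cpu", dtype=torch.bfloat16):
        y = lin(x)                   # inputs and weights are cast to bf16 before addmm
    assert y.dtype == torch.bfloat16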
V0627 17:31:10.557000 139845268738432 torch/_inductor/compile_fx.py:754] {"inductor_post_grad_graph": {}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1, "has_payload": "1df7e432445d21ccb26ae7f35b4ee86e"}
class <lambda>(torch.nn.Module):
    def forward(self, arg6_1: "f32[1, 832, 768][638976, 768, 1]cpu"):
        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:468 in forward, code: query_layer = self.transpose_for_scores(self.query(hidden_states))
        _frozen_param6: "bf16[768][1]cpu" = self._frozen_param6

        # No stacktrace found for following nodes
        _frozen_param12: "bf16[768, 768][1, 0]cpu" = self._frozen_param12

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:469 in forward, code: key_layer = self.transpose_for_scores(self.key(hidden_states))
        _frozen_param8: "bf16[768][1]cpu" = self._frozen_param8

        # No stacktrace found for following nodes
        _frozen_param13: "bf16[768, 768][1, 0]cpu" = self._frozen_param13

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:470 in forward, code: value_layer = self.transpose_for_scores(self.value(hidden_states))
        _frozen_param10: "bf16[768][1]cpu" = self._frozen_param10

        # No stacktrace found for following nodes
        _frozen_param14: "bf16[768, 768][1, 0]cpu" = self._frozen_param14

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:468 in forward, code: query_layer = self.transpose_for_scores(self.query(hidden_states))
        convert_element_type_2: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg6_1, torch.bfloat16); arg6_1 = None
        _linear_pointwise_default_5: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.mkldnn._linear_pointwise.default(convert_element_type_2, _frozen_param12, _frozen_param6, 'none', [], ''); _frozen_param12 = _frozen_param6 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
        view_2: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.reshape.default(_linear_pointwise_default_5, [1, 832, 12, 64]); _linear_pointwise_default_5 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
        permute_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_2, [0, 2, 1, 3]); view_2 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:469 in forward, code: key_layer = self.transpose_for_scores(self.key(hidden_states))
        _linear_pointwise_default_4: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.mkldnn._linear_pointwise.default(convert_element_type_2, _frozen_param13, _frozen_param8, 'none', [], ''); _frozen_param13 = _frozen_param8 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
        view_5: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.reshape.default(_linear_pointwise_default_4, [1, 832, 12, 64]); _linear_pointwise_default_4 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
        permute_3: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_5, [0, 2, 1, 3]); view_5 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:470 in forward, code: value_layer = self.transpose_for_scores(self.value(hidden_states))
        _linear_pointwise_default_3: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.mkldnn._linear_pointwise.default(convert_element_type_2, _frozen_param14, _frozen_param10, 'none', [], ''); convert_element_type_2 = _frozen_param14 = _frozen_param10 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
        view_8: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.reshape.default(_linear_pointwise_default_3, [1, 832, 12, 64]); _linear_pointwise_default_3 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
        permute_5: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_8, [0, 2, 1, 3]); view_8 = None
        return (permute_1, permute_3, permute_5)
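Compared with the AOT graph, the q/k/v weights have been constant-folded into _frozen_param buffers and addmm rewritten to mkldnn._linear_pointwise; this is Inductor's freezing path for inference. A minimal sketch of compiling with freezing enabled (standard Inductor config names, not taken from this log; `model` and `example_input` are placeholders):

    import torch
    import torch._inductor.config as inductor_config

    inductor_config.freezing = True      # or set TORCHINDUCTOR_FREEZING=1 in the environment
    with torch.no_grad():
        compiled = torch.compile(model)  # `model` stands in for the eager module
        out = compiled(example_input)    # parameters are folded on first run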
V0627 17:31:10.578000 139845268738432 torch/_inductor/graph.py:1693] {"inductor_output_code": {"filename": "/tmp/torchinductor_leslie/wm/cwm7ec52zxt6bl7gt2h7sahtj5wsw4g7ez4jvozekjwtw7nqdl3v.py"}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1, "has_payload": "c068758cb8977ae26fcf611c09070a9a"}
# AOT ID: ['13_inference']
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
from torch._inductor.codegen.memory_planning import _align as align
from torch import device, empty_strided
from torch._inductor.async_compile import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
from torch._inductor.codegen.multi_kernel import MultiKernelCall

aten = torch.ops.aten
inductor_ops = torch.ops.inductor
_quantized = torch.ops._quantized
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
alloc_from_pool = torch.ops.inductor._alloc_from_pool
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
async_compile = AsyncCompile()

_frozen_param6 = None  # device(type='cpu') torch.bfloat16 (768,) (1,) 7f2e300d5490
_frozen_param12 = None  # device(type='cpu') torch.bfloat16 (768, 768) (1, 0) 7f2e300daed0
_frozen_param8 = None  # device(type='cpu') torch.bfloat16 (768,) (1,) 7f2e300a0c70
_frozen_param13 = None  # device(type='cpu') torch.bfloat16 (768, 768) (1, 0) 7f2e300a3e70
_frozen_param10 = None  # device(type='cpu') torch.bfloat16 (768,) (1,) 7f2e300a1490
_frozen_param14 = None  # device(type='cpu') torch.bfloat16 (768, 768) (1, 0) 7f2e300dbfb0

cpp_fused__to_copy_0 = async_compile.cpp_pybinding(['const float*', 'bfloat16*'], '''
#include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
extern "C" void kernel(const float* in_ptr0,
                       bfloat16* out_ptr0)
{
    #pragma omp parallel num_threads(56)
    {
        int tid = omp_get_thread_num();
        {
            #pragma omp for
            for(long x0=static_cast<long>(0L); x0<static_cast<long>(638976L); x0+=static_cast<long>(16L))
            {
                auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<long>(x0), 16);
                auto tmp1 = at::vec::convert<bfloat16>(tmp0);
                tmp1.store(out_ptr0 + static_cast<long>(x0), 16);
            }
        }
    }
}
''')

async_compile.wait(globals())
del async_compile

def call(args):
    arg6_1, = args
    args.clear()
    assert_size_stride(arg6_1, (1, 832, 768), (638976, 768, 1))
    buf0 = empty_strided_cpu((1, 832, 768), (638976, 768, 1), torch.bfloat16)
    cpp_fused__to_copy_0(arg6_1, buf0)
    del arg6_1
    buf1 = torch.ops.mkldnn._linear_pointwise(buf0, _frozen_param12, _frozen_param6, 'none', [-1], '')
    buf2 = torch.ops.mkldnn._linear_pointwise(buf0, _frozen_param13, _frozen_param8, 'none', [-1], '')
    buf3 = torch.ops.mkldnn._linear_pointwise(buf0, _frozen_param14, _frozen_param10, 'none', [-1], '')
    return (reinterpret_tensor(buf1, (1, 12, 832, 64), (638976, 64, 768, 1), 0), reinterpret_tensor(buf2, (1, 12, 832, 64), (638976, 64, 768, 1), 0), reinterpret_tensor(buf3, (1, 12, 832, 64), (638976, 64, 768, 1), 0), )

def benchmark_compiled_module(times=10, repeat=10):
    from torch._dynamo.testing import rand_strided
    from torch._inductor.utils import print_performance
    global _frozen_param6
    _frozen_param6 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.bfloat16)
    global _frozen_param12
    _frozen_param12 = rand_strided((768, 768), (1, 0), device='cpu', dtype=torch.bfloat16)
    global _frozen_param8
    _frozen_param8 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.bfloat16)
    global _frozen_param13
    _frozen_param13 = rand_strided((768, 768), (1, 0), device='cpu', dtype=torch.bfloat16)
    global _frozen_param10
    _frozen_param10 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.bfloat16)
    global _frozen_param14
    _frozen_param14 = rand_strided((768, 768), (1, 0), device='cpu', dtype=torch.bfloat16)
    arg6_1 = rand_strided((1, 832, 768), (638976, 768, 1), device='cpu', dtype=torch.float32)
    fn = lambda: call([arg6_1])
    return print_performance(fn, times=times, repeat=repeat)

if __name__ == "__main__":
    from torch._inductor.wrapper_benchmark import compiled_module_main
    compiled_module_main('hf_BigBird', benchmark_compiled_module)
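In call() above, reinterpret_tensor only rewrites size/stride/offset metadata, so each returned Q/K/V tensor aliases its _linear_pointwise output without a copy. An equivalent eager expression for the first output, under the shapes shown above (illustration only):

    out0 = buf1.as_strided((1, 12, 832, 64), (638976, 64, 768, 1), 0)  # same result as view + permute(0, 2, 1, 3)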
V0627 17:31:10.587000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:10.588000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1, "has_payload": "da04fa8fdd18f2f15ae08b9dbbb492e0"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202264976)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| | | +- GuardManager: source=L['self']._modules, accessed_by=DictGetItemGuardAccessor(_modules)
| | | | +- GuardManager: source=L['self'].key, accessed_by=DictGetItemGuardAccessor(key)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].key, 139839202265648)
| | | | | +- GuardManager: source=L['self'].key.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- GuardManager: source=L['self'].key.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].key.training, 7685824)
| | | | +- GuardManager: source=L['self'].query, accessed_by=DictGetItemGuardAccessor(query)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].query, 139839202265696)
| | | | | +- GuardManager: source=L['self'].query.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- GuardManager: source=L['self'].query.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].query.training, 7685824)
| | | | +- GuardManager: source=L['self'].value, accessed_by=DictGetItemGuardAccessor(value)
| | | | | +- ID_MATCH: ___check_obj_id(L['self'].value, 139839202264592)
| | | | | +- GuardManager: source=L['self'].value.__dict__, accessed_by=GetGenericDictGuardAccessor
| | | | | | +- GuardManager: source=L['self'].value.training, accessed_by=DictGetItemGuardAccessor(training)
| | | | | | | +- ID_MATCH: ___check_obj_id(L['self'].value.training, 7685824)
| | | +- GuardManager: source=L['self'].seed, accessed_by=DictGetItemGuardAccessor(seed)
| | | | +- EQUALS_MATCH: L['self'].seed == 1
| | | +- GuardManager: source=L['self'].block_size, accessed_by=DictGetItemGuardAccessor(block_size)
| | | | +- EQUALS_MATCH: L['self'].block_size == 64
| | | +- GuardManager: source=L['self'].num_random_blocks, accessed_by=DictGetItemGuardAccessor(num_random_blocks)
| | | | +- EQUALS_MATCH: L['self'].num_random_blocks == 3
| | | +- GuardManager: source=L['self'].attention_head_size, accessed_by=DictGetItemGuardAccessor(attention_head_size)
| | | | +- EQUALS_MATCH: L['self'].attention_head_size == 64
| | | +- GuardManager: source=L['self'].num_attention_heads, accessed_by=DictGetItemGuardAccessor(num_attention_heads)
| | | | +- EQUALS_MATCH: L['self'].num_attention_heads == 12
| +- GuardManager: source=L['to_mask'], accessed_by=DictGetItemGuardAccessor(to_mask)
| | +- TENSOR_MATCH: check_tensor(L['to_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 1, 832], stride=[832, 832, 832, 1])
| | +- NO_HASATTR: hasattr(L['to_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['band_mask'], accessed_by=DictGetItemGuardAccessor(band_mask)
| | +- TENSOR_MATCH: check_tensor(L['band_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 9, 64, 192], stride=[110592, 110592, 12288, 192, 1])
| | +- NO_HASATTR: hasattr(L['band_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['from_mask'], accessed_by=DictGetItemGuardAccessor(from_mask)
| | +- TENSOR_MATCH: check_tensor(L['from_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 1, 832, 1], stride=[832, 832, 1, 1])
| | +- NO_HASATTR: hasattr(L['from_mask'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['hidden_states'], accessed_by=DictGetItemGuardAccessor(hidden_states)
| | +- TENSOR_MATCH: check_tensor(L['hidden_states'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 832, 768], stride=[638976, 768, 1])
| | +- NO_HASATTR: hasattr(L['hidden_states'], '_dynamo_dynamic_indices') == False
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['from_blocked_mask'], accessed_by=DictGetItemGuardAccessor(from_blocked_mask)
| | +- TENSOR_MATCH: check_tensor(L['from_blocked_mask'], Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.float32, device=None, requires_grad=False, size=[1, 13, 64], stride=[832, 64, 1])
| | +- NO_HASATTR: hasattr(L['from_blocked_mask'], '_dynamo_dynamic_indices') == False
| | +- TENSOR_ALIASING: L['from_blocked_mask'] is L['to_blocked_mask']
| | +- NO_TENSOR_ALIASING: check_no_aliasing(L['to_mask'], L['band_mask'], L['from_mask'], L['hidden_states'], L['from_blocked_mask'])
| +- GuardManager: source=L['output_attentions'], accessed_by=DictGetItemGuardAccessor(output_attentions)
| | +- ID_MATCH: ___check_obj_id(L['output_attentions'], 7685824)
| +- GuardManager: source=L['to_blocked_mask'], accessed_by=DictGetItemGuardAccessor(to_blocked_mask)
| | +- TENSOR_ALIASING: L['from_blocked_mask'] is L['to_blocked_mask']
V0627 17:31:10.588000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "9/1", "frame_key": "26", "co_name": "forward", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 446, "cache_size": 0, "accumulated_cache_size": 1, "guard_count": 21, "shape_env_guard_count": 0, "graph_op_count": 9, "graph_node_count": 11, "graph_input_count": 1, "start_time": 1719534670.3936255, "entire_frame_compile_time_s": 0.19471240043640137, "backend_compile_time_s": 0.1402432918548584, "inductor_compile_time_s": 0.033010005950927734, "code_gen_time_s": 0.012862920761108398, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Graph break due to unsupported builtin numpy.random.mtrand.seed. This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind). If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround. If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/docs/main/notes/custom_operators.html for more details) or, if it is traceable, use torch.compiler.allow_in_graph."], "dynamo_time_before_restart_s": 0.024178743362426758, "has_guarded_code": true}, "frame_id": 9, "frame_compile_id": 1, "attempt": 1}
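The restart reason above names the two available workarounds for the numpy.random.mtrand.seed graph break. A sketch of the allow_in_graph route it suggests, with a hypothetical wrapper (untested here, and only plausible because the argument is a plain int rather than a tensor):

    import numpy as np
    import torch

    def _np_seed(seed: int) -> None:    # hypothetical wrapper, not from this log
        np.random.seed(seed)

    torch.compiler.allow_in_graph(_np_seed)  # dynamo will not trace into the wrapper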
V0627 17:31:10.589000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 472, "name": "forward", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 10, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.614000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 10, "frame_compile_id": 1, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:10.614000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 10, "frame_compile_id": 1, "attempt": 1, "has_payload": "1ea07e64f0c0d490d94336fa323c05e9"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['seed'], accessed_by=DictGetItemGuardAccessor(seed)
| | +- EQUALS_MATCH: L['seed'] == 1
| +- GuardManager: source=L['batch_size'], accessed_by=DictGetItemGuardAccessor(batch_size)
| | +- EQUALS_MATCH: L['batch_size'] == 1
| +- GuardManager: source=L['to_seq_len'], accessed_by=DictGetItemGuardAccessor(to_seq_len)
| | +- EQUALS_MATCH: L['to_seq_len'] == 832
| +- GuardManager: source=L['from_seq_len'], accessed_by=DictGetItemGuardAccessor(from_seq_len)
| | +- EQUALS_MATCH: L['from_seq_len'] == 832
| +- GuardManager: source=L['to_block_size'], accessed_by=DictGetItemGuardAccessor(to_block_size)
| | +- EQUALS_MATCH: L['to_block_size'] == 64
| +- GuardManager: source=L['from_block_size'], accessed_by=DictGetItemGuardAccessor(from_block_size)
| | +- EQUALS_MATCH: L['from_block_size'] == 64
| +- GuardManager: source=L['attention_head_size'], accessed_by=DictGetItemGuardAccessor(attention_head_size)
| | +- EQUALS_MATCH: L['attention_head_size'] == 64
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['np'], accessed_by=DictGetItemGuardAccessor(np)
| | | +- ID_MATCH: ___check_obj_id(G['np'], 139845228893488)
| | | +- GuardManager: source=G['np'].random, accessed_by=GetAttrGuardAccessor(random)
| | | | +- ID_MATCH: ___check_obj_id(G['np'].random, 139842452860464)
| | | | +- GuardManager: source=G['np'].random.seed, accessed_by=GetAttrGuardAccessor(seed)
| | | | | +- ID_MATCH: ___check_obj_id(G['np'].random.seed, 139842451129264)
| | +- GuardManager: source=G['math'], accessed_by=DictGetItemGuardAccessor(math)
| | | +- ID_MATCH: ___check_obj_id(G['math'], 139845236089744)
| | | +- GuardManager: source=G['math'].sqrt, accessed_by=GetAttrGuardAccessor(sqrt)
| | | | +- ID_MATCH: ___check_obj_id(G['math'].sqrt, 139845236093344)
V0627 17:31:10.615000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "10/1", "frame_key": "27", "co_name": "bigbird_block_sparse_attention", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 516, "cache_size": 1, "accumulated_cache_size": 1, "guard_count": 17, "shape_env_guard_count": 0, "graph_op_count": 0, "graph_node_count": 0, "graph_input_count": 0, "start_time": 1719534670.5898829, "entire_frame_compile_time_s": 0.02506852149963379, "backend_compile_time_s": null, "inductor_compile_time_s": null, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["Graph break due to unsupported builtin numpy.random.mtrand.seed. This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind). If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround. If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/docs/main/notes/custom_operators.html for more details) or, if it is traceable, use torch.compiler.allow_in_graph."], "dynamo_time_before_restart_s": 0.009800434112548828, "has_guarded_code": true}, "frame_id": 10, "frame_compile_id": 1, "attempt": 1}
V0627 17:31:10.615000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 472, "name": "forward", "filename": 5}, {"line": 569, "name": "bigbird_block_sparse_attention", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 11, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.647000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 11, "frame_compile_id": 1, "attempt": 1, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:10.648000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 11, "frame_compile_id": 1, "attempt": 1, "has_payload": "b6b8c289bd494c29f862b3959f02ec26"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202264976)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| +- GuardManager: source=L['n_heads'], accessed_by=DictGetItemGuardAccessor(n_heads)
| | +- EQUALS_MATCH: L['n_heads'] == 12
| +- GuardManager: source=L['to_seq_len'], accessed_by=DictGetItemGuardAccessor(to_seq_len)
| | +- EQUALS_MATCH: L['to_seq_len'] == 832
| +- GuardManager: source=L['from_seq_len'], accessed_by=DictGetItemGuardAccessor(from_seq_len)
| | +- EQUALS_MATCH: L['from_seq_len'] == 832
| +- GuardManager: source=L['n_rand_blocks'], accessed_by=DictGetItemGuardAccessor(n_rand_blocks)
| | +- EQUALS_MATCH: L['n_rand_blocks'] == 3
| +- GuardManager: source=L['to_block_size'], accessed_by=DictGetItemGuardAccessor(to_block_size)
| | +- EQUALS_MATCH: L['to_block_size'] == 64
| +- GuardManager: source=L['from_block_size'], accessed_by=DictGetItemGuardAccessor(from_block_size)
| | +- EQUALS_MATCH: L['from_block_size'] == 64
| +- GuardManager: source=L['plan_from_length'], accessed_by=DictGetItemGuardAccessor(plan_from_length)
| | +- ID_MATCH: ___check_obj_id(L['plan_from_length'], 7636800)
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['__builtins_dict___69'], accessed_by=DictGetItemGuardAccessor(__builtins_dict___69)
| | | +- GuardManager: source=G['__builtins_dict___69']['int'], accessed_by=DictGetItemGuardAccessor(int)
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___69']['int'], 7648640)
V0627 17:31:10.648000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "11/1", "frame_key": "28", "co_name": "torch_dynamo_resume_in_bigbird_block_sparse_attention_at_569", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 569, "cache_size": 0, "accumulated_cache_size": 1, "guard_count": 14, "shape_env_guard_count": 0, "graph_op_count": 0, "graph_node_count": 0, "graph_input_count": 0, "start_time": 1719534670.6159284, "entire_frame_compile_time_s": 0.03219175338745117, "backend_compile_time_s": null, "inductor_compile_time_s": null, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": ["data dependent operator: aten._local_scalar_dense.default; to enable, set torch._dynamo.config.capture_scalar_outputs = True"], "dynamo_time_before_restart_s": 0.01743292808532715, "has_guarded_code": true}, "frame_id": 11, "frame_compile_id": 1, "attempt": 1}
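The restart reason here states the fix verbatim: a one-line config change lets dynamo capture the scalar produced by aten._local_scalar_dense (e.g. a .item() call) instead of graph-breaking:

    import torch._dynamo

    torch._dynamo.config.capture_scalar_outputs = True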
V0627 17:31:10.649000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 472, "name": "forward", "filename": 5}, {"line": 569, "name": "bigbird_block_sparse_attention", "filename": 5}, {"line": 583, "name": "torch_dynamo_resume_in_bigbird_block_sparse_attention_at_569", "filename": 5}, {"line": 1165, "name": "_bigbird_block_rand_mask_with_head", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.653000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.653000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e300921b0>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.653000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 58, "id": 0, "source": "___from_numpy(L['___stack0'][0])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.655000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.655000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30091ee0>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.655000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 58, "id": 1, "source": "___from_numpy(L['___stack0'][1])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.657000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.657000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 2, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30092c00>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.657000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 58, "id": 2, "source": "___from_numpy(L['___stack0'][2])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.659000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 3, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.659000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 3, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30152570>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.659000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 58, "id": 3, "source": "___from_numpy(L['___stack0'][3])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.661000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 4, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.661000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 4, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 4, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ffc0040>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.661000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 58, "id": 4, "source": "___from_numpy(L['___stack0'][4])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.663000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 5, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.663000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 5, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 5, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ffc0950>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.663000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 58, "id": 5, "source": "___from_numpy(L['___stack0'][5])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.665000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 6, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.665000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 6, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 6, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ffc0680>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.665000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 58, "id": 6, "source": "___from_numpy(L['___stack0'][6])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.667000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 7, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.667000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 7, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 7, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ffc1530>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.667000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 58, "id": 7, "source": "___from_numpy(L['___stack0'][7])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.669000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 8, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.669000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 8, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 8, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ffc1e90>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.669000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 58, "id": 8, "source": "___from_numpy(L['___stack0'][8])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.671000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 9, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.671000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 9, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 9, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ffc2840>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.671000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 58, "id": 9, "source": "___from_numpy(L['___stack0'][9])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.673000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 10, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.673000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 10, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 10, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ffc30b0>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.674000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 58, "id": 10, "source": "___from_numpy(L['___stack0'][10])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.675000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 11, "describer_id": 58, "size": 156}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.676000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 11, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [13, 3], "is_leaf": true, "stride": [3, 1], "storage": 11, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ffc3470>", "describer_id": 58}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.676000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 58, "id": 11, "source": "___from_numpy(L['___stack0'][11])"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.681000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {"l_stack0_0_": [13, 3], "l_stack0_1_": [13, 3], "l_stack0_2_": [13, 3], "l_stack0_3_": [13, 3], "l_stack0_4_": [13, 3], "l_stack0_5_": [13, 3], "l_stack0_6_": [13, 3], "l_stack0_7_": [13, 3], "l_stack0_8_": [13, 3], "l_stack0_9_": [13, 3], "l_stack0_10_": [13, 3], "l_stack0_11_": [13, 3], "wrapped_getitem": [11, 3], "wrapped_getitem_1": [11, 3], "wrapped_getitem_2": [11, 3], "wrapped_getitem_3": [11, 3], "wrapped_getitem_4": [11, 3], "wrapped_getitem_5": [11, 3], "wrapped_getitem_6": [11, 3], "wrapped_getitem_7": [11, 3], "wrapped_getitem_8": [11, 3], "wrapped_getitem_9": [11, 3], "wrapped_getitem_10": [11, 3], "wrapped_getitem_11": [11, 3]}}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0, "has_payload": "17b857365a2979b2936469716bb70fcd"}
class GraphModule(torch.nn.Module):
    def forward(self, L_stack0_0_: "i32[13, 3][3, 1]cpu", L_stack0_1_: "i32[13, 3][3, 1]cpu", L_stack0_2_: "i32[13, 3][3, 1]cpu", L_stack0_3_: "i32[13, 3][3, 1]cpu", L_stack0_4_: "i32[13, 3][3, 1]cpu", L_stack0_5_: "i32[13, 3][3, 1]cpu", L_stack0_6_: "i32[13, 3][3, 1]cpu", L_stack0_7_: "i32[13, 3][3, 1]cpu", L_stack0_8_: "i32[13, 3][3, 1]cpu", L_stack0_9_: "i32[13, 3][3, 1]cpu", L_stack0_10_: "i32[13, 3][3, 1]cpu", L_stack0_11_: "i32[13, 3][3, 1]cpu"):
        l_stack0_0_ = L_stack0_0_
        l_stack0_1_ = L_stack0_1_
        l_stack0_2_ = L_stack0_2_
        l_stack0_3_ = L_stack0_3_
        l_stack0_4_ = L_stack0_4_
        l_stack0_5_ = L_stack0_5_
        l_stack0_6_ = L_stack0_6_
        l_stack0_7_ = L_stack0_7_
        l_stack0_8_ = L_stack0_8_
        l_stack0_9_ = L_stack0_9_
        l_stack0_10_ = L_stack0_10_
        l_stack0_11_ = L_stack0_11_

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1172 in torch_dynamo_resume_in__bigbird_block_rand_mask_with_head_at_1165, code: rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :]
        wrapped_getitem: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem(l_stack0_0_, (slice(1, 12, None), slice(None, None, None))); l_stack0_0_ = None
        wrapped_getitem_1: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_1(l_stack0_1_, (slice(1, 12, None), slice(None, None, None))); l_stack0_1_ = None
        wrapped_getitem_2: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_2(l_stack0_2_, (slice(1, 12, None), slice(None, None, None))); l_stack0_2_ = None
        wrapped_getitem_3: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_3(l_stack0_3_, (slice(1, 12, None), slice(None, None, None))); l_stack0_3_ = None
        wrapped_getitem_4: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_4(l_stack0_4_, (slice(1, 12, None), slice(None, None, None))); l_stack0_4_ = None
        wrapped_getitem_5: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_5(l_stack0_5_, (slice(1, 12, None), slice(None, None, None))); l_stack0_5_ = None
        wrapped_getitem_6: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_6(l_stack0_6_, (slice(1, 12, None), slice(None, None, None))); l_stack0_6_ = None
        wrapped_getitem_7: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_7(l_stack0_7_, (slice(1, 12, None), slice(None, None, None))); l_stack0_7_ = None
        wrapped_getitem_8: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_8(l_stack0_8_, (slice(1, 12, None), slice(None, None, None))); l_stack0_8_ = None
        wrapped_getitem_9: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_9(l_stack0_9_, (slice(1, 12, None), slice(None, None, None))); l_stack0_9_ = None
        wrapped_getitem_10: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_10(l_stack0_10_, (slice(1, 12, None), slice(None, None, None))); l_stack0_10_ = None
        wrapped_getitem_11: "i32[11, 3][3, 1]cpu" = torch__dynamo_utils_wrapped_getitem_11(l_stack0_11_, (slice(1, 12, None), slice(None, None, None))); l_stack0_11_ = None
        return (wrapped_getitem, wrapped_getitem_1, wrapped_getitem_2, wrapped_getitem_3, wrapped_getitem_4, wrapped_getitem_5, wrapped_getitem_6, wrapped_getitem_7, wrapped_getitem_8, wrapped_getitem_9, wrapped_getitem_10, wrapped_getitem_11)
V0627 17:31:10.751000 139845268738432 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:199] {"aot_forward_graph": {}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0, "has_payload": "949e5e36955b193a43ad76da2f5a5f52"}
class <lambda>(torch.nn.Module):
    def forward(self, arg0_1: "i32[13, 3][3, 1]cpu", arg1_1: "i32[13, 3][3, 1]cpu", arg2_1: "i32[13, 3][3, 1]cpu", arg3_1: "i32[13, 3][3, 1]cpu", arg4_1: "i32[13, 3][3, 1]cpu", arg5_1: "i32[13, 3][3, 1]cpu", arg6_1: "i32[13, 3][3, 1]cpu", arg7_1: "i32[13, 3][3, 1]cpu", arg8_1: "i32[13, 3][3, 1]cpu", arg9_1: "i32[13, 3][3, 1]cpu", arg10_1: "i32[13, 3][3, 1]cpu", arg11_1: "i32[13, 3][3, 1]cpu"):
        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1172 in torch_dynamo_resume_in__bigbird_block_rand_mask_with_head_at_1165, code: rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :]
        slice_1: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg0_1, 0, 1, 12); arg0_1 = None
        slice_2: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_1, 1, 0, 9223372036854775807); slice_1 = None
        slice_3: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg1_1, 0, 1, 12); arg1_1 = None
        slice_4: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_3, 1, 0, 9223372036854775807); slice_3 = None
        slice_5: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg2_1, 0, 1, 12); arg2_1 = None
        slice_6: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_5, 1, 0, 9223372036854775807); slice_5 = None
        slice_7: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg3_1, 0, 1, 12); arg3_1 = None
        slice_8: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_7, 1, 0, 9223372036854775807); slice_7 = None
        slice_9: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg4_1, 0, 1, 12); arg4_1 = None
        slice_10: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_9, 1, 0, 9223372036854775807); slice_9 = None
        slice_11: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg5_1, 0, 1, 12); arg5_1 = None
        slice_12: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_11, 1, 0, 9223372036854775807); slice_11 = None
        slice_13: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg6_1, 0, 1, 12); arg6_1 = None
        slice_14: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_13, 1, 0, 9223372036854775807); slice_13 = None
        slice_15: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg7_1, 0, 1, 12); arg7_1 = None
        slice_16: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_15, 1, 0, 9223372036854775807); slice_15 = None
        slice_17: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg8_1, 0, 1, 12); arg8_1 = None
        slice_18: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_17, 1, 0, 9223372036854775807); slice_17 = None
        slice_19: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg9_1, 0, 1, 12); arg9_1 = None
        slice_20: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_19, 1, 0, 9223372036854775807); slice_19 = None
        slice_21: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg10_1, 0, 1, 12); arg10_1 = None
        slice_22: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_21, 1, 0, 9223372036854775807); slice_21 = None
        slice_23: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg11_1, 0, 1, 12); arg11_1 = None
        slice_24: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(slice_23, 1, 0, 9223372036854775807); slice_23 = None
        return (slice_2, slice_4, slice_6, slice_8, slice_10, slice_12, slice_14, slice_16, slice_18, slice_20, slice_22, slice_24)
V0627 17:31:10.788000 139845268738432 torch/_inductor/compile_fx.py:754] {"inductor_post_grad_graph": {}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0, "has_payload": "9c01c298863b324227937cc74dd4d962"}
class <lambda>(torch.nn.Module):
    def forward(self, arg0_1: "i32[13, 3][3, 1]cpu", arg1_1: "i32[13, 3][3, 1]cpu", arg2_1: "i32[13, 3][3, 1]cpu", arg3_1: "i32[13, 3][3, 1]cpu", arg4_1: "i32[13, 3][3, 1]cpu", arg5_1: "i32[13, 3][3, 1]cpu", arg6_1: "i32[13, 3][3, 1]cpu", arg7_1: "i32[13, 3][3, 1]cpu", arg8_1: "i32[13, 3][3, 1]cpu", arg9_1: "i32[13, 3][3, 1]cpu", arg10_1: "i32[13, 3][3, 1]cpu", arg11_1: "i32[13, 3][3, 1]cpu"):
        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1172 in torch_dynamo_resume_in__bigbird_block_rand_mask_with_head_at_1165, code: rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :]
        slice_1: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg0_1, 0, 1, 12); arg0_1 = None
        slice_3: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg1_1, 0, 1, 12); arg1_1 = None
        slice_5: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg2_1, 0, 1, 12); arg2_1 = None
        slice_7: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg3_1, 0, 1, 12); arg3_1 = None
        slice_9: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg4_1, 0, 1, 12); arg4_1 = None
        slice_11: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg5_1, 0, 1, 12); arg5_1 = None
        slice_13: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg6_1, 0, 1, 12); arg6_1 = None
        slice_15: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg7_1, 0, 1, 12); arg7_1 = None
        slice_17: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg8_1, 0, 1, 12); arg8_1 = None
        slice_19: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg9_1, 0, 1, 12); arg9_1 = None
        slice_21: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg10_1, 0, 1, 12); arg10_1 = None
        slice_23: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg11_1, 0, 1, 12); arg11_1 = None
        return (slice_1, slice_3, slice_5, slice_7, slice_9, slice_11, slice_13, slice_15, slice_17, slice_19, slice_21, slice_23)
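
The post-grad graph above is the same computation as the pre-grad graph minus the dim-1 slices: aten.slice.Tensor with end=9223372036854775807 (INT64_MAX) covers the whole dimension, so the pass keeps only the dim-0 slices that take rows 1..12 of each (13, 3) input. A minimal standalone sketch of that equivalence (not part of the trace; the tensor values are made up):

import torch

x = torch.arange(39, dtype=torch.int32).reshape(13, 3)
# Slicing dim 1 up to INT64_MAX selects everything, so it is a no-op view.
noop = torch.ops.aten.slice.Tensor(x, 1, 0, 9223372036854775807)
assert torch.equal(noop, x)
# Only the dim-0 slice changes the shape: (13, 3) -> (11, 3), stride unchanged.
rows = torch.ops.aten.slice.Tensor(x, 0, 1, 12)
assert rows.shape == (11, 3) and rows.stride() == (3, 1)
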
V0627 17:31:10.802000 139845268738432 torch/_inductor/graph.py:1693] {"inductor_output_code": {"filename": "/tmp/torchinductor_leslie/of/cof3htzjwffvxd2lla7sn2ozynci436rdmah5vsvllsahmxz6qro.py"}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0, "has_payload": "c06d796ae11c7e77048735efc71e26ca"}
# AOT ID: ['14_inference']
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
from torch._inductor.codegen.memory_planning import _align as align
from torch import device, empty_strided
from torch._inductor.async_compile import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
from torch._inductor.codegen.multi_kernel import MultiKernelCall

aten = torch.ops.aten
inductor_ops = torch.ops.inductor
_quantized = torch.ops._quantized
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
alloc_from_pool = torch.ops.inductor._alloc_from_pool
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
async_compile = AsyncCompile()
async_compile.wait(globals())
del async_compile

def call(args):
    arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1 = args
    args.clear()
    assert_size_stride(arg0_1, (13, 3), (3, 1))
    assert_size_stride(arg1_1, (13, 3), (3, 1))
    assert_size_stride(arg2_1, (13, 3), (3, 1))
    assert_size_stride(arg3_1, (13, 3), (3, 1))
    assert_size_stride(arg4_1, (13, 3), (3, 1))
    assert_size_stride(arg5_1, (13, 3), (3, 1))
    assert_size_stride(arg6_1, (13, 3), (3, 1))
    assert_size_stride(arg7_1, (13, 3), (3, 1))
    assert_size_stride(arg8_1, (13, 3), (3, 1))
    assert_size_stride(arg9_1, (13, 3), (3, 1))
    assert_size_stride(arg10_1, (13, 3), (3, 1))
    assert_size_stride(arg11_1, (13, 3), (3, 1))
    return (reinterpret_tensor(arg0_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg1_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg2_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg3_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg4_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg5_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg6_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg7_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg8_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg9_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg10_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg11_1, (11, 3), (3, 1), 3), )

def benchmark_compiled_module(times=10, repeat=10):
    from torch._dynamo.testing import rand_strided
    from torch._inductor.utils import print_performance
    arg0_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg1_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg2_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg3_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg4_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg5_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg6_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg7_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg8_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg9_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg10_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    arg11_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
    fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1])
    return print_performance(fn, times=times, repeat=repeat)

if __name__ == "__main__":
    from torch._inductor.wrapper_benchmark import compiled_module_main
    compiled_module_main('hf_BigBird', benchmark_compiled_module)
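
The call() above does no arithmetic at all: each output is a reinterpret_tensor view of its input at storage offset 3, which on a contiguous (13, 3) tensor is exactly what x[1:12] produces. A minimal sketch of that equivalence (not part of the generated file):

import torch

x = torch.zeros((13, 3), dtype=torch.int32)
view = x[1:12]                     # matches reinterpret_tensor(x, (11, 3), (3, 1), 3)
assert view.shape == (11, 3)
assert view.stride() == (3, 1)
assert view.storage_offset() == 3  # skips one row of 3 int32 elements
assert view.data_ptr() == x.data_ptr() + 3 * x.element_size()  # zero-copy
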
V0627 17:31:10.810000 139845268738432 torch/_dynamo/guards.py:2317] {"dynamo_guards": {}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0, "has_payload": "18b50eaa01e860d2c78d96b8478bfd75"}
[
]
V0627 17:31:10.810000 139845268738432 torch/_dynamo/guards.py:2145] {"dynamo_cpp_guards_str": {}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0, "has_payload": "6e13f24b700fd79116617b1177bb6706"}
TREE_GUARD_MANAGER:
+- RootGuardManager
| +- DEFAULT_DEVICE: utils_device.CURRENT_DEVICE == None # _dynamo/output_graph.py:460 in init_ambient_guards
| +- GLOBAL_STATE: ___check_global_state()
| +- GuardManager: source=L['self'], accessed_by=DictGetItemGuardAccessor(self)
| | +- ID_MATCH: ___check_obj_id(L['self'], 139839202264976)
| | +- GuardManager: source=L['self'].__dict__, accessed_by=GetGenericDictGuardAccessor
| | | +- GuardManager: source=L['self'].training, accessed_by=DictGetItemGuardAccessor(training)
| | | | +- ID_MATCH: ___check_obj_id(L['self'].training, 7685824)
| +- GuardManager: source=L['___stack0'], accessed_by=DictGetItemGuardAccessor(___stack0)
| | +- TYPE_MATCH: ___check_type_id(L['___stack0'], 7650400)
| | +- LENGTH_CHECK: len(L['___stack0']) == 12
| | +- GuardManager: source=L['___stack0'][0], accessed_by=ListGetItemGuardAccessor(0)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][0]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][0]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][0]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][1], accessed_by=ListGetItemGuardAccessor(1)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][1]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][1]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][1]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][2], accessed_by=ListGetItemGuardAccessor(2)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][2]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][2]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][2]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][3], accessed_by=ListGetItemGuardAccessor(3)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][3]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][3]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][3]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][4], accessed_by=ListGetItemGuardAccessor(4)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][4]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][4]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][4]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][5], accessed_by=ListGetItemGuardAccessor(5)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][5]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][5]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][5]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][6], accessed_by=ListGetItemGuardAccessor(6)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][6]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][6]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][6]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][7], accessed_by=ListGetItemGuardAccessor(7)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][7]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][7]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][7]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][8], accessed_by=ListGetItemGuardAccessor(8)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][8]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][8]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][8]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][9], accessed_by=ListGetItemGuardAccessor(9)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][9]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][9]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][9]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][10], accessed_by=ListGetItemGuardAccessor(10)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][10]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][10]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][10]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| | +- GuardManager: source=L['___stack0'][11], accessed_by=ListGetItemGuardAccessor(11)
| | | +- GuardManager: source=___from_numpy(L['___stack0'][11]), accessed_by=PythonLambdaGuardAccessor
| | | | +- TENSOR_MATCH: check_tensor(___from_numpy(L['___stack0'][11]), Tensor, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU, AutocastCPU), torch.int32, device=None, requires_grad=False, size=[13, 3], stride=[3, 1])
| | | | +- NO_HASATTR: hasattr(___from_numpy(L['___stack0'][11]), '_dynamo_dynamic_indices') == False
| | | | +- NO_TENSOR_ALIASING: check_no_aliasing(___from_numpy(L['___stack0'][0]), ___from_numpy(L['___stack0'][1]), ___from_numpy(L['___stack0'][2]), ___from_numpy(L['___stack0'][3]), ___from_numpy(L['___stack0'][4]), ___from_numpy(L['___stack0'][5]), ___from_numpy(L['___stack0'][6]), ___from_numpy(L['___stack0'][7]), ___from_numpy(L['___stack0'][8]), ___from_numpy(L['___stack0'][9]), ___from_numpy(L['___stack0'][10]), ___from_numpy(L['___stack0'][11]))
| +- GuardManager: source=L['num_heads'], accessed_by=DictGetItemGuardAccessor(num_heads)
| | +- EQUALS_MATCH: L['num_heads'] == 12
| +- GuardManager: source=L['num_blocks'], accessed_by=DictGetItemGuardAccessor(num_blocks)
| | +- EQUALS_MATCH: L['num_blocks'] == 13
| +- GuardManager: source=L['global_block_top'], accessed_by=DictGetItemGuardAccessor(global_block_top)
| | +- EQUALS_MATCH: L['global_block_top'] == 1
| +- GuardManager: source=L['global_block_bottom'], accessed_by=DictGetItemGuardAccessor(global_block_bottom)
| | +- EQUALS_MATCH: L['global_block_bottom'] == 1
| +- GuardManager: source=G, accessed_by=GlobalsGuardAccessor
| | +- GuardManager: source=G['__builtins_dict___71'], accessed_by=DictGetItemGuardAccessor(__builtins_dict___71)
| | | +- GuardManager: source=G['__builtins_dict___71']['range'], accessed_by=DictGetItemGuardAccessor(range)
| | | | +- ID_MATCH: ___check_obj_id(G['__builtins_dict___71']['range'], 7632448)
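
A rough sketch of what this guard tree re-checks before the cached compiled code is reused; this is an illustration under assumed names (frame_locals), not Dynamo's actual guard evaluator:

import torch

def guards_hold(frame_locals):
    # TYPE_MATCH / LENGTH_CHECK on ___stack0 (a list of 12 numpy arrays).
    stack0 = frame_locals["___stack0"]
    if type(stack0) is not list or len(stack0) != 12:
        return False
    # EQUALS_MATCH on the captured Python scalars.
    if frame_locals["num_heads"] != 12 or frame_locals["num_blocks"] != 13:
        return False
    if frame_locals["global_block_top"] != 1 or frame_locals["global_block_bottom"] != 1:
        return False
    # TENSOR_MATCH: each element must still view as an int32 (13, 3) tensor.
    return all(
        torch.from_numpy(a).dtype == torch.int32 and torch.from_numpy(a).shape == (13, 3)
        for a in stack0
    )
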
V0627 17:31:10.810000 139845268738432 torch/_dynamo/utils.py:719] {"compilation_metrics": {"compile_id": "14/1", "frame_key": "29", "co_name": "torch_dynamo_resume_in__bigbird_block_rand_mask_with_head_at_1165", "co_filename": "/localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py", "co_firstlineno": 1165, "cache_size": 0, "accumulated_cache_size": 1, "guard_count": 25, "shape_env_guard_count": 0, "graph_op_count": 12, "graph_node_count": 25, "graph_input_count": 12, "start_time": 1719534670.6493185, "entire_frame_compile_time_s": 0.16145634651184082, "backend_compile_time_s": 0.12227082252502441, "inductor_compile_time_s": 0.022518634796142578, "code_gen_time_s": 0.0035479068756103516, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": [], "compliant_custom_ops": [], "restart_reasons": [], "dynamo_time_before_restart_s": 0.0, "has_guarded_code": true}, "frame_id": 14, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.811000 139845268738432 torch/_dynamo/convert_frame.py:802] {"dynamo_start": {"stack": [{"line": 456, "name": "<module>", "filename": 0}, {"line": 452, "name": "torchbench_main", "filename": 0}, {"line": 3661, "name": "main", "filename": 1}, {"line": 3593, "name": "process_entry", "filename": 1}, {"line": 4220, "name": "run", "filename": 1}, {"line": 2965, "name": "run_one_model", "filename": 1}, {"line": 2805, "name": "run_performance_test", "filename": 1}, {"line": 2742, "name": "warmup", "filename": 1}, {"line": 433, "name": "_fn", "filename": 2}, {"line": 430, "name": "forward_pass", "filename": 0}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2450, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 2077, "name": "forward", "filename": 5}, {"line": 2133, "name": "torch_dynamo_resume_in_forward_at_2077", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1631, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1488, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 1401, "name": "forward", "filename": 5}, {"line": 1553, "name": "_wrapped_call_impl", "filename": 4}, {"line": 1562, "name": "_call_impl", "filename": 4}, {"line": 472, "name": "forward", "filename": 5}, {"line": 569, "name": "bigbird_block_sparse_attention", "filename": 5}, {"line": 583, "name": "torch_dynamo_resume_in_bigbird_block_sparse_attention_at_569", "filename": 5}, {"line": 1116, "name": "__call__", "filename": 3}]}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.818000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 0, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.818000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 0, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ff64360>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.818000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 0, "source": "___from_numpy(L['___stack0'][0])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.819000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 1, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.819000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 1, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ff5e020>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.819000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 1, "source": "___from_numpy(L['___stack0'][1])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.820000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 2, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.820000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 2, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 2, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ff5f5b0>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.820000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 2, "source": "___from_numpy(L['___stack0'][2])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.821000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 3, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.821000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 3, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 3, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e300934c0>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.821000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 3, "source": "___from_numpy(L['___stack0'][3])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.822000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 4, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.822000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 4, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 4, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ff46cf0>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.822000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 4, "source": "___from_numpy(L['___stack0'][4])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.823000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 5, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.823000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 5, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 5, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ff44d60>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.823000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 5, "source": "___from_numpy(L['___stack0'][5])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.823000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 6, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.824000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 6, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 6, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ffb1a80>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.824000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 6, "source": "___from_numpy(L['___stack0'][6])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.824000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 7, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.825000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 7, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 7, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e300dbbf0>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.825000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 7, "source": "___from_numpy(L['___stack0'][7])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.825000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 8, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.825000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 8, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 8, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30090bd0>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.826000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 8, "source": "___from_numpy(L['___stack0'][8])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.826000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 9, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.826000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 9, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 9, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ff67920>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.826000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 9, "source": "___from_numpy(L['___stack0'][9])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.827000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 10, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.827000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 10, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 10, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ffc3010>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.827000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 10, "source": "___from_numpy(L['___stack0'][10])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.828000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 11, "describer_id": 60, "size": 132}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.828000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 11, "ndim": 2, "dtype": "torch.int32", "device": "device(type='cpu')", "size": [11, 3], "is_leaf": true, "stride": [3, 1], "storage": 11, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e2ffc20c0>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.828000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 11, "source": "___from_numpy(L['___stack0'][11])"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.830000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 12, "describer_id": 60, "size": 1277952}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.830000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 12, "ndim": 4, "dtype": "torch.bfloat16", "device": "device(type='cpu')", "size": [1, 12, 832, 64], "is_leaf": true, "stride": [638976, 64, 768, 1], "storage": 12, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30014c20>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.830000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 12, "source": "L['query_layer']"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.833000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 13, "describer_id": 60, "size": 3328}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.834000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 14, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 832], "is_leaf": true, "stride": [832, 1], "storage": 13, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31fff6a0>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.834000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 13, "ndim": 3, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 13, 64], "is_leaf": true, "is_view": true, "stride": [832, 64, 1], "storage": 13, "base": 14, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317fbec0>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.834000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 13, "source": "L['from_blocked_mask']"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.840000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 14, "describer_id": 60, "size": 1277952}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.840000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 15, "ndim": 4, "dtype": "torch.bfloat16", "device": "device(type='cpu')", "size": [1, 12, 832, 64], "is_leaf": true, "stride": [638976, 64, 768, 1], "storage": 14, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30014220>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.840000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 15, "source": "L['key_layer']"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.841000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 15, "describer_id": 60, "size": 1277952}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.842000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 16, "ndim": 4, "dtype": "torch.bfloat16", "device": "device(type='cpu')", "size": [1, 12, 832, 64], "is_leaf": true, "stride": [638976, 64, 768, 1], "storage": 15, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e30015d50>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.842000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 16, "source": "L['value_layer']"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.856000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 17, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 1, 832], "is_leaf": true, "is_view": true, "stride": [832, 832, 832, 1], "storage": 13, "base": 14, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e317f85e0>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.856000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 17, "source": "L['to_mask']"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.893000 139845268738432 torch/_subclasses/meta_utils.py:198] {"describe_storage": {"id": 16, "describer_id": 60, "size": 442368}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.893000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 18, "ndim": 5, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 9, 64, 192], "is_leaf": true, "stride": [110592, 110592, 12288, 192, 1], "storage": 16, "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e315ec0e0>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.893000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 18, "source": "L['band_mask']"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.936000 139845268738432 torch/_subclasses/meta_utils.py:396] {"describe_tensor": {"id": 19, "ndim": 4, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1, 1, 832, 1], "is_leaf": true, "is_view": true, "stride": [832, 832, 1, 1], "storage": 13, "base": 14, "creation_meta": "CreationMeta.NO_GRAD_MODE", "view_func": "<built-in method _view_func_unsafe of Tensor object at 0x7f2e31ea6b60>", "describer_id": 60}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.936000 139845268738432 torch/_subclasses/meta_utils.py:1601] {"describe_source": {"describer_id": 60, "id": 19, "source": "L['from_mask']"}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0}
V0627 17:31:10.952000 139845268738432 torch/_dynamo/output_graph.py:1295] {"dynamo_output_graph": {"sizes": {"l_stack0_0_": [11, 3], "l_stack0_1_": [11, 3], "l_stack0_2_": [11, 3], "l_stack0_3_": [11, 3], "l_stack0_4_": [11, 3], "l_stack0_5_": [11, 3], "l_stack0_6_": [11, 3], "l_stack0_7_": [11, 3], "l_stack0_8_": [11, 3], "l_stack0_9_": [11, 3], "l_stack0_10_": [11, 3], "l_stack0_11_": [11, 3], "l_query_layer_": [1, 12, 832, 64], "l_from_blocked_mask_": [1, 13, 64], "l_key_layer_": [1, 12, 832, 64], "l_value_layer_": [1, 12, 832, 64], "l_to_mask_": [1, 1, 1, 832], "l_band_mask_": [1, 1, 9, 64, 192], "l_from_mask_": [1, 1, 832, 1], "rand_attn": [12, 11, 3], "rand_attn_1": [1, 12, 11, 3], "unsqueeze_": [1, 12, 11, 3], "rand_attn_2": [1, 12, 11, 3], "p1": [13, 64], "i1": [12, 11, 3], "flatten": [396], "getitem_2": [396, 64], "rand_mask": [1, 396, 64], "rand_mask_1": [1, 12, 11, 192], "getitem_3": [1, 11, 64], "rand_mask_2": [1, 12, 11, 64, 192], "blocked_query_matrix": [1, 12, 13, 64, 64], "blocked_key_matrix": [1, 12, 13, 64, 64], "blocked_value_matrix": [1, 12, 13, 64, 64], "shift": [396], "div": [396], "indices_shift": [396], "view_4": [396], "flattened_indices": [396], "flattened_params": [156, 64, 64], "out_flattened": [396, 64, 64], "out": [1, 12, 33, 64, 64], "gathered_key": [1, 12, 11, 192, 64], "shift_1": [396], "div_1": [396], "indices_shift_1": [396], "view_6": [396], "flattened_indices_1": [396], "flattened_params_1": [156, 64, 64], "out_flattened_1": [396, 64, 64], "out_1": [1, 12, 33, 64, 64], "gathered_value": [1, 12, 11, 192, 64], "getitem_4": [1, 12, 64, 64], "reshape_4": [12, 64, 64], "reshape_5": [12, 832, 64], "transpose": [12, 64, 832], "bmm": [12, 64, 832], "first_product": [1, 12, 64, 832], "first_product_1": [1, 12, 64, 832], "sub": [1, 1, 1, 832], "mul_3": [1, 1, 1, 832], "first_product_2": [1, 12, 64, 832], "first_attn_weights": [1, 12, 64, 832], "reshape_6": [12, 64, 832], "reshape_7": [12, 832, 64], "bmm_1": [12, 64, 64], "first_context_layer": [1, 12, 1, 64, 64], "unsqueeze__1": [1, 12, 1, 64, 64], "getitem_5": [1, 12, 64, 64], "getitem_6": [1, 12, 64, 64], "getitem_7": [1, 12, 64, 64], "getitem_8": [1, 12, 64, 64], "getitem_9": [1, 12, 192, 64], "second_key_mat": [1, 12, 448, 64], "getitem_10": [1, 12, 64, 64], "getitem_11": [1, 12, 64, 64], "getitem_12": [1, 12, 64, 64], "getitem_13": [1, 12, 64, 64], "getitem_14": [1, 12, 192, 64], "second_value_mat": [1, 12, 448, 64], "getitem_15": [1, 12, 64, 64], "reshape_8": [12, 64, 64], "reshape_9": [12, 448, 64], "transpose_1": [12, 64, 448], "bmm_2": [12, 64, 448], "second_product": [1, 12, 64, 448], "getitem_16": [1, 1, 1, 192], "getitem_17": [1, 1, 1, 64], "new_ones": [1, 1, 1, 192], "second_seq_pad": [1, 1, 1, 448], "new_ones_1": [1, 12, 64, 256], "getitem_18": [1, 12, 64, 192], "second_rand_pad": [1, 12, 64, 448], "second_product_1": [1, 12, 64, 448], "minimum": [1, 12, 64, 448], "sub_1": [1, 12, 64, 448], "mul_5": [1, 12, 64, 448], "second_product_2": [1, 12, 64, 448], "second_attn_weights": [1, 12, 64, 448], "reshape_10": [12, 64, 448], "reshape_11": [12, 448, 64], "bmm_3": [12, 64, 64], "second_context_layer": [1, 12, 1, 64, 64], "unsqueeze__2": [1, 12, 1, 64, 64], "getitem_19": [1, 12, 9, 64, 64], "getitem_20": [1, 12, 9, 64, 64], "getitem_21": [1, 12, 9, 64, 64], "exp_blocked_key_matrix": [1, 12, 9, 192, 64], "getitem_22": [1, 12, 9, 64, 64], "getitem_23": [1, 12, 9, 64, 64], "getitem_24": [1, 12, 9, 64, 64], "exp_blocked_value_matrix": [1, 12, 9, 192, 64], "middle_query_matrix": [1, 12, 9, 64, 64], 
"reshape_12": [108, 64, 64], "reshape_13": [108, 192, 64], "transpose_2": [108, 64, 192], "bmm_4": [108, 64, 192], "inner_band_product": [1, 12, 9, 64, 192], "inner_band_product_1": [1, 12, 9, 64, 192], "getitem_26": [1, 12, 9, 192, 64], "reshape_14": [108, 64, 64], "reshape_15": [108, 192, 64], "transpose_3": [108, 64, 192], "bmm_5": [108, 64, 192], "rand_band_product": [1, 12, 9, 64, 192], "rand_band_product_1": [1, 12, 9, 64, 192], "getitem_27": [1, 12, 64, 64], "first_band_product": [1, 12, 9, 64, 64], "first_band_product_1": [1, 12, 9, 64, 64], "getitem_28": [1, 12, 64, 64], "last_band_product": [1, 12, 9, 64, 64], "last_band_product_1": [1, 12, 9, 64, 64], "sub_2": [1, 1, 9, 64, 192], "mul_10": [1, 1, 9, 64, 192], "inner_band_product_2": [1, 12, 9, 64, 192], "getitem_29": [1, 1, 1, 64], "unsqueeze": [1, 1, 1, 1, 64], "sub_3": [1, 1, 1, 1, 64], "mul_11": [1, 1, 1, 1, 64], "first_band_product_2": [1, 12, 9, 64, 64], "getitem_30": [1, 1, 1, 64], "unsqueeze_1": [1, 1, 1, 1, 64], "sub_4": [1, 1, 1, 1, 64], "mul_12": [1, 1, 1, 1, 64], "last_band_product_2": [1, 12, 9, 64, 64], "getitem_31": [1, 12, 9, 64, 192], "sub_5": [1, 12, 9, 64, 192], "mul_13": [1, 12, 9, 64, 192], "rand_band_product_2": [1, 12, 9, 64, 192], "band_product": [1, 12, 9, 64, 512], "attn_weights": [1, 12, 9, 64, 512], "getitem_32": [1, 12, 9, 64, 192], "reshape_16": [108, 64, 192], "reshape_17": [108, 192, 64], "bmm_6": [108, 64, 64], "context_layer": [1, 12, 9, 64, 64], "getitem_33": [1, 12, 9, 64, 192], "getitem_34": [1, 12, 9, 192, 64], "reshape_18": [108, 64, 192], "reshape_19": [108, 192, 64], "bmm_7": [108, 64, 64], "view_15": [1, 12, 9, 64, 64], "context_layer_1": [1, 12, 9, 64, 64], "getitem_35": [1, 12, 9, 64, 64], "getitem_36": [1, 12, 64, 64], "einsum_3": [1, 12, 9, 64, 64], "context_layer_2": [1, 12, 9, 64, 64], "getitem_37": [1, 12, 9, 64, 64], "getitem_38": [1, 12, 64, 64], "einsum_4": [1, 12, 9, 64, 64], "context_layer_3": [1, 12, 9, 64, 64], "getitem_39": [1, 12, 64, 64], "getitem_40": [1, 12, 64, 64], "getitem_41": [1, 12, 64, 64], "getitem_42": [1, 12, 64, 64], "getitem_43": [1, 12, 192, 64], "second_last_key_mat": [1, 12, 448, 64], "getitem_44": [1, 12, 64, 64], "getitem_45": [1, 12, 64, 64], "getitem_46": [1, 12, 64, 64], "getitem_47": [1, 12, 64, 64], "getitem_48": [1, 12, 192, 64], "second_last_value_mat": [1, 12, 448, 64], "getitem_49": [1, 12, 64, 64], "reshape_20": [12, 64, 64], "reshape_21": [12, 448, 64], "transpose_4": [12, 64, 448], "bmm_8": [12, 64, 448], "second_last_product": [1, 12, 64, 448], "getitem_50": [1, 1, 1, 64], "getitem_51": [1, 1, 1, 192], "new_ones_2": [1, 1, 1, 192], "second_last_seq_pad": [1, 1, 1, 448], "new_ones_3": [1, 12, 64, 256], "getitem_52": [1, 12, 64, 192], "second_last_rand_pad": [1, 12, 64, 448], "second_last_product_1": [1, 12, 64, 448], "minimum_1": [1, 12, 64, 448], "sub_6": [1, 12, 64, 448], "mul_15": [1, 12, 64, 448], "second_last_product_2": [1, 12, 64, 448], "second_last_attn_weights": [1, 12, 64, 448], "reshape_22": [12, 64, 448], "reshape_23": [12, 448, 64], "bmm_9": [12, 64, 64], "second_last_context_layer": [1, 12, 1, 64, 64], "unsqueeze__3": [1, 12, 1, 64, 64], "getitem_53": [1, 12, 64, 64], "reshape_24": [12, 64, 64], "reshape_25": [12, 832, 64], "transpose_5": [12, 64, 832], "bmm_10": [12, 64, 832], "last_product": [1, 12, 64, 832], "last_product_1": [1, 12, 64, 832], "sub_7": [1, 1, 1, 832], "mul_17": [1, 1, 1, 832], "last_product_2": [1, 12, 64, 832], "last_attn_weights": [1, 12, 64, 832], "reshape_26": [12, 64, 832], "reshape_27": [12, 832, 64], 
"bmm_11": [12, 64, 64], "last_context_layer": [1, 12, 1, 64, 64], "unsqueeze__4": [1, 12, 1, 64, 64], "context_layer_4": [1, 12, 13, 64, 64], "view_20": [1, 12, 832, 64], "context_layer_5": [1, 12, 832, 64], "context_layer_6": [1, 832, 12, 64]}}, "frame_id": 15, "frame_compile_id": 1, "attempt": 0, "has_payload": "8c75f78cbe780240c3647a51d604b94a"}
class GraphModule(torch.nn.Module):
    def forward(self, L_stack0_0_: "i32[11, 3][3, 1]cpu", L_stack0_1_: "i32[11, 3][3, 1]cpu", L_stack0_2_: "i32[11, 3][3, 1]cpu", L_stack0_3_: "i32[11, 3][3, 1]cpu", L_stack0_4_: "i32[11, 3][3, 1]cpu", L_stack0_5_: "i32[11, 3][3, 1]cpu", L_stack0_6_: "i32[11, 3][3, 1]cpu", L_stack0_7_: "i32[11, 3][3, 1]cpu", L_stack0_8_: "i32[11, 3][3, 1]cpu", L_stack0_9_: "i32[11, 3][3, 1]cpu", L_stack0_10_: "i32[11, 3][3, 1]cpu", L_stack0_11_: "i32[11, 3][3, 1]cpu", L_query_layer_: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", L_from_blocked_mask_: "f32[1, 13, 64][832, 64, 1]cpu", L_key_layer_: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", L_value_layer_: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", L_to_mask_: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu", L_band_mask_: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu", L_from_mask_: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu"):
        l_stack0_0_ = L_stack0_0_
        l_stack0_1_ = L_stack0_1_
        l_stack0_2_ = L_stack0_2_
        l_stack0_3_ = L_stack0_3_
        l_stack0_4_ = L_stack0_4_
        l_stack0_5_ = L_stack0_5_
        l_stack0_6_ = L_stack0_6_
        l_stack0_7_ = L_stack0_7_
        l_stack0_8_ = L_stack0_8_
        l_stack0_9_ = L_stack0_9_
        l_stack0_10_ = L_stack0_10_
        l_stack0_11_ = L_stack0_11_
        l_query_layer_ = L_query_layer_
        l_from_blocked_mask_ = L_from_blocked_mask_
        l_key_layer_ = L_key_layer_
        l_value_layer_ = L_value_layer_
        l_to_mask_ = L_to_mask_
        l_band_mask_ = L_band_mask_
        l_from_mask_ = L_from_mask_

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:593 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = np.stack(rand_attn, axis=0)
        rand_attn: "i32[12, 11, 3][33, 3, 1]cpu" = torch__dynamo_utils_wrapped_stack([l_stack0_0_, l_stack0_1_, l_stack0_2_, l_stack0_3_, l_stack0_4_, l_stack0_5_, l_stack0_6_, l_stack0_7_, l_stack0_8_, l_stack0_9_, l_stack0_10_, l_stack0_11_], axis = 0); l_stack0_0_ = l_stack0_1_ = l_stack0_2_ = l_stack0_3_ = l_stack0_4_ = l_stack0_5_ = l_stack0_6_ = l_stack0_7_ = l_stack0_8_ = l_stack0_9_ = l_stack0_10_ = l_stack0_11_ = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:594 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = torch.tensor(rand_attn, device=query_layer.device, dtype=torch.long)
        rand_attn_1: "i64[1, 12, 11, 3][396, 33, 3, 1]cpu" = torch.tensor(rand_attn, device = device(type='cpu'), dtype = torch.int64); rand_attn = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:595 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn.unsqueeze_(0)
        unsqueeze_: "i64[1, 12, 11, 3][396, 33, 3, 1]cpu" = rand_attn_1.unsqueeze_(0)

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:596 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = torch.cat([rand_attn for _ in range(batch_size)], dim=0)
        rand_attn_2: "i64[1, 12, 11, 3][396, 33, 3, 1]cpu" = torch.cat([rand_attn_1], dim = 0); rand_attn_1 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in _create_rand_mask_from_inputs, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
        p1: "f32[13, 64][64, 1]cpu" = l_from_blocked_mask_[0]
        i1: "i64[12, 11, 3][33, 3, 1]cpu" = rand_attn_2[0]

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in <listcomp>, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
        flatten: "i64[396][1]cpu" = i1.flatten(); i1 = None
        getitem_2: "f32[396, 64][64, 1]cpu" = p1[flatten]; p1 = flatten = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in _create_rand_mask_from_inputs, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
        rand_mask: "f32[1, 396, 64][25344, 64, 1]cpu" = torch.stack([getitem_2]); getitem_2 = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1016 in _create_rand_mask_from_inputs, code: rand_mask = rand_mask.view(batch_size, num_attention_heads, num_windows, num_rand_blocks * from_block_size)
        rand_mask_1: "f32[1, 12, 11, 192][25344, 2112, 192, 1]cpu" = rand_mask.view(1, 12, 11, 192); rand_mask = None

        # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-pa
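
The _create_rand_mask_from_inputs step quoted above (the last complete step before the gist's truncation point) gathers mask rows by flattened random-block index. A minimal sketch with the shapes from this trace, using random stand-in values rather than the model's data:

import torch

from_blocked_mask = torch.ones(1, 13, 64)           # f32[1, 13, 64]
rand_attn = torch.randint(0, 13, (1, 12, 11, 3))    # i64[1, 12, 11, 3]
# One gather per batch element: p1 is [13, 64], i1.flatten() has 12*11*3 = 396 indices.
rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(from_blocked_mask, rand_attn)])
rand_mask = rand_mask.view(1, 12, 11, 192)          # 396 * 64 == 12 * 11 * 192 elements
assert rand_mask.shape == (1, 12, 11, 192)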