Created
April 16, 2024 00:57
-
-
Save msaroufim/f6d234f22428848cf23dbc0566e77130 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5 | |
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py: | |
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3 | |
/home/marksaroufim/.local/lib/python3.10/site-packages/torch/_inductor/compile_fx.py:133: UserWarning: TensorFloat32 tensor cores for float32 matrix multiplication available but not enabled. Consider setting `torch.set_float32_matmul_precision('high')` for better performance. | |
warnings.warn( | |
INFO:root:running build_ext | |
AUTOTUNE mm(4096x4096, 4096x32) | |
mm 0.0672 ms 100.0% | |
triton_mm_6 0.1812 ms 37.1% | |
triton_mm_9 0.1883 ms 35.7% | |
triton_mm_8 0.3150 ms 21.3% | |
triton_mm_3 0.3175 ms 21.2% | |
triton_mm_5 0.3179 ms 21.1% | |
triton_mm_11 0.3465 ms 19.4% | |
triton_mm_1 0.3610 ms 18.6% | |
triton_mm_2 0.3711 ms 18.1% | |
triton_mm_0 0.4112 ms 16.3% | |
SingleProcess AUTOTUNE benchmarking takes 5.3445 seconds and 0.0000 seconds precompiling | |
AUTOTUNE mm(4096x32, 32x4096) | |
triton_mm_12 0.0366 ms 100.0% | |
triton_mm_16 0.0407 ms 90.0% | |
triton_mm_13 0.0411 ms 89.1% | |
triton_mm_14 0.0418 ms 87.6% | |
triton_mm_22 0.0420 ms 87.3% | |
triton_mm_20 0.0437 ms 83.8% | |
triton_mm_15 0.0437 ms 83.8% | |
triton_mm_17 0.0516 ms 71.0% | |
triton_mm_19 0.0518 ms 70.7% | |
triton_mm_18 0.0520 ms 70.4% | |
SingleProcess AUTOTUNE benchmarking takes 5.9478 seconds and 0.0000 seconds precompiling | |
Traceback (most recent call last): | |
File "/home/marksaroufim/ao/benchmarks/bench_galore_fused_kernels.py", line 62, in <module> | |
run(args) | |
File "/home/marksaroufim/ao/benchmarks/bench_galore_fused_kernels.py", line 29, in run | |
benchmark.run(show_plots=False, print_data=True, save_path=save_path) | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/triton/testing.py", line 343, in run | |
result_dfs.append(self._run(bench, save_path, show_plots, print_data, **kwargs)) | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/triton/testing.py", line 286, in _run | |
ret = self.fn(**x_args, **{bench.line_arg: y}, **bench.args, **kwrags) | |
File "/home/marksaroufim/ao/benchmarks/fused_benchmark_utils.py", line 244, in benchmark | |
ms, min_ms, max_ms = triton.testing.do_bench( | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/triton/testing.py", line 100, in do_bench | |
fn() | |
File "/home/marksaroufim/ao/benchmarks/fused_benchmark_utils.py", line 245, in <lambda> | |
lambda: compiled_op( | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 410, in _fn | |
return fn(*args, **kwargs) | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py", line 978, in catch_errors | |
return callback(frame, cache_entry, hooks, frame_state, skip=1) | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py", line 411, in _convert_frame_assert | |
return _compile( | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/torch/_utils_internal.py", line 70, in wrapper_function | |
return function(*args, **kwargs) | |
File "/home/marksaroufim/.conda/envs/ao/lib/python3.10/contextlib.py", line 79, in inner | |
return func(*args, **kwds) | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py", line 700, in _compile | |
guarded_code = compile_inner(code, one_graph, hooks, transform) | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/torch/_dynamo/utils.py", line 266, in time_wrapper | |
r = func(*args, **kwargs) | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py", line 568, in compile_inner | |
out_code = transform_code_object(code, transform) | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/torch/_dynamo/bytecode_transformation.py", line 1116, in transform_code_object | |
transformations(instructions, code_options) | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py", line 173, in _fn | |
return fn(*args, **kwargs) | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py", line 515, in transform | |
tracer.run() | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py", line 2237, in run | |
super().run() | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py", line 875, in run | |
while self.step(): | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py", line 790, in step | |
self.dispatch_table[inst.opcode](self, inst) | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py", line 2380, in RETURN_VALUE | |
self._return(inst) | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py", line 2365, in _return | |
self.output.compile_subgraph( | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/torch/_dynamo/output_graph.py", line 1075, in compile_subgraph | |
self.compile_and_call_fx_graph(tx, pass2.graph_output_vars(), root) | |
File "/home/marksaroufim/.conda/envs/ao/lib/python3.10/contextlib.py", line 79, in inner | |
return func(*args, **kwds) | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/torch/_dynamo/output_graph.py", line 1264, in compile_and_call_fx_graph | |
compiled_fn = self.call_user_compiler(gm) | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/torch/_dynamo/utils.py", line 266, in time_wrapper | |
r = func(*args, **kwargs) | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/torch/_dynamo/output_graph.py", line 1331, in call_user_compiler | |
raise BackendCompilerFailed(self.compiler_fn, e).with_traceback( | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/torch/_dynamo/output_graph.py", line 1312, in call_user_compiler | |
compiled_fn = compiler_fn(gm, self.example_inputs()) | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/torch/_dynamo/repro/after_dynamo.py", line 127, in debug_wrapper | |
compiled_gm = compiler_fn(gm, example_inputs) | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/torch/_dynamo/repro/after_dynamo.py", line 127, in debug_wrapper | |
compiled_gm = compiler_fn(gm, example_inputs) | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/torch/__init__.py", line 1742, in __call__ | |
return compile_fx(model_, inputs_, config_patches=self.config) | |
File "/home/marksaroufim/.conda/envs/ao/lib/python3.10/contextlib.py", line 79, in inner | |
return func(*args, **kwds) | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/torch/_inductor/compile_fx.py", line 1162, in compile_fx | |
return compile_fx( | |
File "/home/marksaroufim/.conda/envs/ao/lib/python3.10/contextlib.py", line 79, in inner | |
return func(*args, **kwds) | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/torch/_inductor/compile_fx.py", line 1398, in compile_fx | |
return aot_autograd( | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/torch/_dynamo/backends/common.py", line 65, in compiler_fn | |
cg = aot_module_simplified(gm, example_inputs, **kwargs) | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py", line 958, in aot_module_simplified | |
compiled_fn = create_aot_dispatcher_function( | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/torch/_dynamo/utils.py", line 266, in time_wrapper | |
r = func(*args, **kwargs) | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py", line 685, in create_aot_dispatcher_function | |
compiled_fn = compiler_fn( | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 469, in aot_wrapper_dedupe | |
return compiler_fn(flat_fn, leaf_flat_args, aot_config, fw_metadata=fw_metadata) | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 671, in aot_wrapper_synthetic_base | |
return compiler_fn(flat_fn, flat_args, aot_config, fw_metadata=fw_metadata) | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py", line 149, in aot_dispatch_base | |
compiled_fw = compiler(fw_module, updated_flat_args) | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/torch/_dynamo/utils.py", line 266, in time_wrapper | |
r = func(*args, **kwargs) | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/torch/_inductor/compile_fx.py", line 1311, in fw_compiler_base | |
return inner_compile( | |
File "/home/marksaroufim/.conda/envs/ao/lib/python3.10/contextlib.py", line 79, in inner | |
return func(*args, **kwds) | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/torch/_dynamo/repro/after_aot.py", line 83, in debug_wrapper | |
inner_compiled_fn = compiler_fn(gm, example_inputs) | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/torch/_inductor/debug.py", line 304, in inner | |
return fn(*args, **kwargs) | |
File "/home/marksaroufim/.conda/envs/ao/lib/python3.10/contextlib.py", line 79, in inner | |
return func(*args, **kwds) | |
File "/home/marksaroufim/.conda/envs/ao/lib/python3.10/contextlib.py", line 79, in inner | |
return func(*args, **kwds) | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/torch/_dynamo/utils.py", line 266, in time_wrapper | |
r = func(*args, **kwargs) | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/torch/_inductor/compile_fx.py", line 469, in compile_fx_inner | |
compiled_graph = fx_codegen_and_compile( | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/torch/_inductor/compile_fx.py", line 746, in fx_codegen_and_compile | |
compiled_fn = graph.compile_to_fn() | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/torch/_inductor/graph.py", line 1449, in compile_to_fn | |
return self.compile_to_module().call | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/torch/_dynamo/utils.py", line 266, in time_wrapper | |
r = func(*args, **kwargs) | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/torch/_inductor/graph.py", line 1396, in compile_to_module | |
mod = PyCodeCache.load_by_key_path( | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/torch/_inductor/codecache.py", line 2357, in load_by_key_path | |
exec(code, mod.__dict__, mod.__dict__) | |
File "/tmp/torchinductor_marksaroufim/5b/c5b2d44uzexjfh7wqv7qhsytwjez7w74t5wtdvfydezwof7ij6sh.py", line 233, in <module> | |
async_compile.wait(globals()) | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/torch/_inductor/codecache.py", line 2957, in wait | |
scope[key] = result.result() | |
File "/home/marksaroufim/.local/lib/python3.10/site-packages/torch/_inductor/codecache.py", line 2755, in result | |
self.future.result() | |
File "/home/marksaroufim/.conda/envs/ao/lib/python3.10/concurrent/futures/_base.py", line 458, in result | |
return self.__get_result() | |
File "/home/marksaroufim/.conda/envs/ao/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result | |
raise self._exception | |
torch._dynamo.exc.BackendCompilerFailed: backend='inductor' raised: | |
BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending. | |
Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information | |
You can suppress this exception and fall back to eager by setting: | |
import torch._dynamo | |
torch._dynamo.config.suppress_errors = True | |
(ao) [marksaroufim@devvm17057.vll0 ~/ao (galore_fused)]$ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment