Skip to content

Instantly share code, notes, and snippets.

@philipturner
Last active June 14, 2024 18:15
Show Gist options
  • Save philipturner/45781f4515145106fc0d4e598dd5f13b to your computer and use it in GitHub Desktop.
Save philipturner/45781f4515145106fc0d4e598dd5f13b to your computer and use it in GitHub Desktop.
### M1 Max Statistics ###
0 - device store
1 - device store, with two-part load/store
2 - threadgroup store
3 - threadgroup store, with two-part load/store
FP32 (48x48x24)
problemSize = 976 | A B | 896 -> 896 -> 832 -> 832 | 7197 -> 7198 -> 7228 -> 7236 ( +1, +31, +39) GFLOPS
problemSize = 977 | A B | 832 -> 832 -> 768 -> 768 | 7027 -> 7040 -> 7060 -> 7083 ( +13, +33, +56) GFLOPS
problemSize = 978 | A B | 832 -> 832 -> 768 -> 768 | 7111 -> 7106 -> 7137 -> 7153 ( -5, +26, +42) GFLOPS
problemSize = 979 | A B | 832 -> 832 -> 768 -> 768 | 7121 -> 7160 -> 7174 -> 7175 ( +39, +53, +54) GFLOPS
problemSize = 980 | A B | 832 -> 832 -> 768 -> 768 | 7221 -> 7245 -> 7255 -> 7245 ( +24, +34, +24) GFLOPS
problemSize = 981 | A B | 832 -> 832 -> 768 -> 768 | 7236 -> 7234 -> 7248 -> 7243 ( -2, +12, +7) GFLOPS
problemSize = 982 | A B | 832 -> 832 -> 768 -> 768 | 7261 -> 7250 -> 7291 -> 7279 ( -11, +30, +18) GFLOPS
problemSize = 983 | A B | 832 -> 832 -> 768 -> 768 | 7244 -> 7260 -> 7316 -> 7305 ( +16, +72, +61) GFLOPS
problemSize = 984 | A B | 768 -> 768 -> 832 -> 832 | 7391 -> 7383 -> 7420 -> 7387 ( -8, +29, -4) GFLOPS
problemSize = 985 | A B | 896 -> 896 -> 832 -> 832 | 6841 -> 6827 -> 6883 -> 6926 ( -14, +42, +85) GFLOPS
problemSize = 986 | A B | 896 -> 896 -> 832 -> 832 | 6925 -> 6909 -> 7032 -> 6987 ( -16, +107, +62) GFLOPS
problemSize = 987 | A B | 896 -> 896 -> 832 -> 832 | 6916 -> 6922 -> 7009 -> 7015 ( +6, +93, +99) GFLOPS
problemSize = 988 | A B | 896 -> 896 -> 832 -> 832 | 7045 -> 7065 -> 7119 -> 7102 ( +20, +74, +57) GFLOPS
problemSize = 989 | A B | 896 -> 896 -> 832 -> 832 | 7004 -> 6997 -> 7103 -> 7069 ( -7, +99, +65) GFLOPS
problemSize = 990 | A B | 896 -> 896 -> 832 -> 832 | 7081 -> 7073 -> 7172 -> 7141 ( -8, +91, +60) GFLOPS
problemSize = 991 | A B | 896 -> 896 -> 832 -> 832 | 7069 -> 7058 -> 7163 -> 7157 ( -11, +94, +88) GFLOPS
problemSize = 992 | A B | 896 -> 896 -> 832 -> 832 | 7175 -> 7178 -> 7301 -> 7261 ( +3, +126, +86) GFLOPS
problemSize = 993 | A B | 896 -> 896 -> 832 -> 832 | 6980 -> 6955 -> 7024 -> 7038 ( -25, +44, +58) GFLOPS
problemSize = 994 | A B | 896 -> 896 -> 832 -> 832 | 7015 -> 7080 -> 7118 -> 7091 ( +65, +103, +76) GFLOPS
problemSize = 995 | A B | 896 -> 896 -> 832 -> 832 | 7059 -> 7056 -> 7124 -> 7115 ( -3, +65, +56) GFLOPS
problemSize = 996 | A B | 896 -> 896 -> 832 -> 832 | 7181 -> 7198 -> 7198 -> 7200 ( +17, +17, +19) GFLOPS
problemSize = 997 | A B | 896 -> 896 -> 832 -> 832 | 7154 -> 7185 -> 7191 -> 7185 ( +31, +37, +31) GFLOPS
problemSize = 998 | A B | 896 -> 896 -> 832 -> 832 | 7220 -> 7224 -> 7208 -> 7232 ( +4, -12, +12) GFLOPS
problemSize = 999 | A B | 896 -> 896 -> 832 -> 832 | 7229 -> 7218 -> 7281 -> 7247 ( -11, +52, +18) GFLOPS
problemSize = 1000 | A B | 896 -> 896 -> 832 -> 832 | 7341 -> 7304 -> 7395 -> 7378 ( -37, +54, +37) GFLOPS
problemSize = 1001 | A B | 832 -> 832 -> 768 -> 768 | 7171 -> 7124 -> 7129 -> 7170 ( -47, -42, -1) GFLOPS
problemSize = 1002 | A B | 832 -> 832 -> 768 -> 768 | 7189 -> 7182 -> 7244 -> 7229 ( -7, +55, +40) GFLOPS
problemSize = 1003 | A B | 832 -> 832 -> 768 -> 768 | 7207 -> 7194 -> 7263 -> 7227 ( -13, +56, +20) GFLOPS
problemSize = 1004 | A B | 832 -> 832 -> 768 -> 768 | 7311 -> 7310 -> 7333 -> 7324 ( -1, +22, +13) GFLOPS
problemSize = 1005 | A B | 832 -> 832 -> 768 -> 768 | 7284 -> 7296 -> 7325 -> 7291 ( +12, +41, +7) GFLOPS
problemSize = 1006 | A B | 832 -> 832 -> 768 -> 768 | 7366 -> 7390 -> 7357 -> 7363 ( +24, -9, -3) GFLOPS
problemSize = 1007 | A B | 832 -> 832 -> 768 -> 768 | 7300 -> 7304 -> 7373 -> 7395 ( +4, +73, +95) GFLOPS
problemSize = 1008 | A B | 768 -> 768 -> 832 -> 832 | 7503 -> 7484 -> 7484 -> 7508 ( -19, -19, +5) GFLOPS
problemSize = 1009 | A B | 896 -> 896 -> 832 -> 832 | 6855 -> 6854 -> 6916 -> 6915 ( -1, +61, +60) GFLOPS
problemSize = 1010 | A B | 896 -> 896 -> 832 -> 832 | 6919 -> 6949 -> 7026 -> 6997 ( +30, +107, +78) GFLOPS
problemSize = 1011 | A B | 896 -> 896 -> 832 -> 832 | 6979 -> 6958 -> 7031 -> 7031 ( -21, +52, +52) GFLOPS
problemSize = 1012 | A B | 896 -> 896 -> 832 -> 832 | 7073 -> 7063 -> 7110 -> 7100 ( -10, +37, +27) GFLOPS
problemSize = 1013 | A B | 896 -> 896 -> 832 -> 832 | 7051 -> 7053 -> 7108 -> 7106 ( +2, +57, +55) GFLOPS
problemSize = 1014 | A B | 896 -> 896 -> 832 -> 832 | 7116 -> 7115 -> 7155 -> 7138 ( -1, +39, +22) GFLOPS
problemSize = 1015 | A B | 896 -> 896 -> 832 -> 832 | 7123 -> 7134 -> 7167 -> 7153 ( +11, +44, +30) GFLOPS
problemSize = 1016 | A B | 896 -> 896 -> 832 -> 832 | 7174 -> 7157 -> 7240 -> 7242 ( -17, +66, +68) GFLOPS
problemSize = 1017 | A B | 896 -> 896 -> 832 -> 832 | 6949 -> 6945 -> 7036 -> 7046 ( -4, +87, +97) GFLOPS
problemSize = 1018 | A B | 896 -> 896 -> 832 -> 832 | 7023 -> 7050 -> 7087 -> 7086 ( +27, +64, +63) GFLOPS
problemSize = 1019 | A B | 896 -> 896 -> 832 -> 832 | 7064 -> 7058 -> 7130 -> 7117 ( -6, +66, +53) GFLOPS
problemSize = 1020 | A B | 896 -> 896 -> 832 -> 832 | 7161 -> 7155 -> 7191 -> 7185 ( -6, +30, +24) GFLOPS
problemSize = 1021 | A B | 896 -> 896 -> 832 -> 832 | 7165 -> 7150 -> 7224 -> 7214 ( -15, +59, +49) GFLOPS
problemSize = 1022 | A B | 896 -> 896 -> 832 -> 832 | 7217 -> 7208 -> 7231 -> 7234 ( -9, +14, +17) GFLOPS
problemSize = 1023 | A B | 896 -> 896 -> 832 -> 832 | 7224 -> 7204 -> 7255 -> 7256 ( -20, +31, +32) GFLOPS
problemSize = 1024 | A B | 896 -> 896 -> 832 -> 832 | 7304 -> 7283 -> 7313 -> 7320 ( -21, +9, +16) GFLOPS
problemSize = 976 | A B^T | 896 -> 896 -> 896 -> 896 | 7195 -> 7177 -> 7211 -> 7178 ( -18, +16, -17) GFLOPS
problemSize = 977 | A B^T | 832 -> 832 -> 768 -> 768 | 6673 -> 6689 -> 6727 -> 6736 ( +16, +54, +63) GFLOPS
problemSize = 978 | A B^T | 832 -> 832 -> 768 -> 768 | 6857 -> 6862 -> 6913 -> 6924 ( +5, +56, +67) GFLOPS
problemSize = 979 | A B^T | 832 -> 832 -> 768 -> 768 | 6833 -> 6834 -> 6883 -> 6878 ( +1, +50, +45) GFLOPS
problemSize = 980 | A B^T | 832 -> 832 -> 768 -> 768 | 7104 -> 7102 -> 7176 -> 7167 ( -2, +72, +63) GFLOPS
problemSize = 981 | A B^T | 832 -> 832 -> 768 -> 768 | 6985 -> 6994 -> 7051 -> 7031 ( +9, +66, +46) GFLOPS
problemSize = 982 | A B^T | 832 -> 832 -> 768 -> 768 | 7123 -> 7159 -> 7202 -> 7173 ( +36, +79, +50) GFLOPS
problemSize = 983 | A B^T | 832 -> 832 -> 768 -> 768 | 7129 -> 7154 -> 7160 -> 7140 ( +25, +31, +11) GFLOPS
problemSize = 984 | A B^T | 768 -> 768 -> 832 -> 832 | 7465 -> 7475 -> 7455 -> 7460 ( +10, -10, -5) GFLOPS
problemSize = 985 | A B^T | 896 -> 896 -> 832 -> 832 | 6487 -> 6478 -> 6595 -> 6574 ( -9, +108, +87) GFLOPS
problemSize = 986 | A B^T | 896 -> 896 -> 832 -> 832 | 6657 -> 6661 -> 6761 -> 6757 ( +4, +104, +100) GFLOPS
problemSize = 987 | A B^T | 896 -> 896 -> 832 -> 832 | 6637 -> 6653 -> 6732 -> 6737 ( +16, +95, +100) GFLOPS
problemSize = 988 | A B^T | 896 -> 896 -> 832 -> 832 | 6931 -> 6943 -> 7042 -> 7004 ( +12, +111, +73) GFLOPS
problemSize = 989 | A B^T | 896 -> 896 -> 832 -> 832 | 6804 -> 6792 -> 6845 -> 6863 ( -12, +41, +59) GFLOPS
problemSize = 990 | A B^T | 896 -> 896 -> 832 -> 832 | 6968 -> 6986 -> 7052 -> 7035 ( +18, +84, +67) GFLOPS
problemSize = 991 | A B^T | 896 -> 896 -> 832 -> 832 | 6957 -> 6953 -> 6993 -> 6969 ( -4, +36, +12) GFLOPS
problemSize = 992 | A B^T | 896 -> 896 -> 896 -> 896 | 7298 -> 7321 -> 7314 -> 7313 ( +23, +16, +15) GFLOPS
problemSize = 993 | A B^T | 896 -> 896 -> 832 -> 832 | 6589 -> 6599 -> 6645 -> 6644 ( +10, +56, +55) GFLOPS
problemSize = 994 | A B^T | 896 -> 896 -> 832 -> 832 | 6756 -> 6775 -> 6899 -> 6842 ( +19, +143, +86) GFLOPS
problemSize = 995 | A B^T | 896 -> 896 -> 832 -> 832 | 6742 -> 6744 -> 6825 -> 6820 ( +2, +83, +78) GFLOPS
problemSize = 996 | A B^T | 896 -> 896 -> 832 -> 832 | 7025 -> 7005 -> 7115 -> 7143 ( -20, +90, +118) GFLOPS
problemSize = 997 | A B^T | 896 -> 896 -> 832 -> 832 | 6915 -> 6910 -> 6990 -> 6958 ( -5, +75, +43) GFLOPS
problemSize = 998 | A B^T | 896 -> 896 -> 832 -> 832 | 7087 -> 7100 -> 7169 -> 7158 ( +13, +82, +71) GFLOPS
problemSize = 999 | A B^T | 896 -> 896 -> 832 -> 832 | 7097 -> 7084 -> 7132 -> 7136 ( -13, +35, +39) GFLOPS
problemSize = 1000 | A B^T | 896 -> 896 -> 896 -> 896 | 7316 -> 7306 -> 7275 -> 7293 ( -10, -41, -23) GFLOPS
problemSize = 1001 | A B^T | 832 -> 832 -> 768 -> 768 | 6801 -> 6792 -> 6834 -> 6857 ( -9, +33, +56) GFLOPS
problemSize = 1002 | A B^T | 832 -> 832 -> 768 -> 768 | 6975 -> 6951 -> 7017 -> 7045 ( -24, +42, +70) GFLOPS
problemSize = 1003 | A B^T | 832 -> 832 -> 768 -> 768 | 6963 -> 6944 -> 7011 -> 6986 ( -19, +48, +23) GFLOPS
problemSize = 1004 | A B^T | 832 -> 832 -> 768 -> 768 | 7236 -> 7203 -> 7318 -> 7271 ( -33, +82, +35) GFLOPS
problemSize = 1005 | A B^T | 832 -> 832 -> 768 -> 768 | 7049 -> 7032 -> 7106 -> 7103 ( -17, +57, +54) GFLOPS
problemSize = 1006 | A B^T | 832 -> 832 -> 768 -> 768 | 7242 -> 7258 -> 7301 -> 7315 ( +16, +59, +73) GFLOPS
problemSize = 1007 | A B^T | 832 -> 832 -> 768 -> 768 | 7201 -> 7192 -> 7206 -> 7223 ( -9, +5, +22) GFLOPS
problemSize = 1008 | A B^T | 768 -> 768 -> 832 -> 832 | 7576 -> 7585 -> 7579 -> 7547 ( +9, +3, -29) GFLOPS
problemSize = 1009 | A B^T | 896 -> 896 -> 832 -> 832 | 6508 -> 6506 -> 6622 -> 6636 ( -2, +114, +128) GFLOPS
problemSize = 1010 | A B^T | 896 -> 896 -> 832 -> 832 | 6652 -> 6651 -> 6800 -> 6789 ( -1, +148, +137) GFLOPS
problemSize = 1011 | A B^T | 896 -> 896 -> 832 -> 832 | 6673 -> 6678 -> 6807 -> 6824 ( +5, +134, +151) GFLOPS
problemSize = 1012 | A B^T | 896 -> 896 -> 832 -> 832 | 6914 -> 6919 -> 7081 -> 7068 ( +5, +167, +154) GFLOPS
problemSize = 1013 | A B^T | 896 -> 896 -> 832 -> 832 | 6890 -> 6911 -> 7013 -> 7002 ( +21, +123, +112) GFLOPS
problemSize = 1014 | A B^T | 896 -> 896 -> 832 -> 832 | 7013 -> 7007 -> 7134 -> 7130 ( -6, +121, +117) GFLOPS
problemSize = 1015 | A B^T | 896 -> 896 -> 832 -> 832 | 7034 -> 7030 -> 7165 -> 7153 ( -4, +131, +119) GFLOPS
problemSize = 1016 | A B^T | 896 -> 896 -> 896 -> 896 | 7204 -> 7203 -> 7270 -> 7259 ( -1, +66, +55) GFLOPS
problemSize = 1017 | A B^T | 896 -> 896 -> 832 -> 832 | 6724 -> 6725 -> 6791 -> 6791 ( +1, +67, +67) GFLOPS
problemSize = 1018 | A B^T | 896 -> 896 -> 832 -> 832 | 6828 -> 6840 -> 6911 -> 6905 ( +12, +83, +77) GFLOPS
problemSize = 1019 | A B^T | 896 -> 896 -> 832 -> 832 | 6866 -> 6886 -> 6940 -> 6949 ( +20, +74, +83) GFLOPS
problemSize = 1020 | A B^T | 896 -> 896 -> 832 -> 832 | 7052 -> 7044 -> 7125 -> 7127 ( -8, +73, +75) GFLOPS
problemSize = 1021 | A B^T | 896 -> 896 -> 832 -> 832 | 7033 -> 7031 -> 7109 -> 7116 ( -2, +76, +83) GFLOPS
problemSize = 1022 | A B^T | 896 -> 896 -> 832 -> 832 | 7132 -> 7147 -> 7196 -> 7186 ( +15, +64, +54) GFLOPS
problemSize = 1023 | A B^T | 896 -> 896 -> 832 -> 832 | 7205 -> 7194 -> 7231 -> 7238 ( -11, +26, +33) GFLOPS
problemSize = 1024 | A B^T | 896 -> 896 -> 896 -> 896 | 7304 -> 7265 -> 7337 -> 7340 ( -39, +33, +36) GFLOPS
For the next block of entries only (FP32, problemSize = 1488 to 1536), just two variants are compared:
0 - device store
1 - threadgroup store
problemSize = 1488 | A B | 768 -> 832 | 8182 -> 8194 ( +12) GFLOPS
problemSize = 1489 | A B | 896 -> 832 | 7762 -> 7849 ( +87) GFLOPS
problemSize = 1490 | A B | 896 -> 832 | 7852 -> 7924 ( +72) GFLOPS
problemSize = 1491 | A B | 896 -> 832 | 7858 -> 7924 ( +66) GFLOPS
problemSize = 1492 | A B | 896 -> 832 | 7970 -> 8006 ( +36) GFLOPS
problemSize = 1493 | A B | 896 -> 832 | 7932 -> 7963 ( +31) GFLOPS
problemSize = 1494 | A B | 896 -> 832 | 7973 -> 8011 ( +38) GFLOPS
problemSize = 1495 | A B | 896 -> 832 | 7967 -> 8014 ( +47) GFLOPS
problemSize = 1496 | A B | 896 -> 832 | 8044 -> 8110 ( +66) GFLOPS
problemSize = 1497 | A B | 896 -> 832 | 7779 -> 7879 (+100) GFLOPS
problemSize = 1498 | A B | 896 -> 832 | 7883 -> 7961 ( +78) GFLOPS
problemSize = 1499 | A B | 896 -> 832 | 7884 -> 7973 ( +89) GFLOPS
problemSize = 1500 | A B | 896 -> 832 | 8008 -> 8052 ( +44) GFLOPS
problemSize = 1501 | A B | 896 -> 832 | 7984 -> 8043 ( +59) GFLOPS
problemSize = 1502 | A B | 896 -> 832 | 8042 -> 8083 ( +41) GFLOPS
problemSize = 1503 | A B | 896 -> 832 | 8050 -> 8091 ( +41) GFLOPS
problemSize = 1504 | A B | 896 -> 832 | 8131 -> 8162 ( +31) GFLOPS
problemSize = 1505 | A B | 832 -> 768 | 7948 -> 8008 ( +60) GFLOPS
problemSize = 1506 | A B | 832 -> 768 | 8030 -> 8083 ( +53) GFLOPS
problemSize = 1507 | A B | 832 -> 768 | 8040 -> 8095 ( +55) GFLOPS
problemSize = 1508 | A B | 832 -> 768 | 8154 -> 8174 ( +20) GFLOPS
problemSize = 1509 | A B | 832 -> 768 | 8116 -> 8171 ( +55) GFLOPS
problemSize = 1510 | A B | 832 -> 768 | 8182 -> 8202 ( +20) GFLOPS
problemSize = 1511 | A B | 832 -> 768 | 8138 -> 8215 ( +77) GFLOPS
problemSize = 1512 | A B | 768 -> 832 | 8271 -> 8286 ( +15) GFLOPS
problemSize = 1513 | A B | 896 -> 832 | 7928 -> 8004 ( +76) GFLOPS
problemSize = 1514 | A B | 896 -> 832 | 8017 -> 8081 ( +64) GFLOPS
problemSize = 1515 | A B | 896 -> 832 | 8029 -> 8095 ( +66) GFLOPS
problemSize = 1516 | A B | 896 -> 832 | 8133 -> 8169 ( +36) GFLOPS
problemSize = 1517 | A B | 896 -> 832 | 8093 -> 8143 ( +50) GFLOPS
problemSize = 1518 | A B | 896 -> 832 | 8141 -> 8184 ( +43) GFLOPS
problemSize = 1519 | A B | 896 -> 832 | 8145 -> 8186 ( +41) GFLOPS
problemSize = 1520 | A B | 896 -> 832 | 8203 -> 8286 ( +83) GFLOPS
problemSize = 1521 | A B | 896 -> 832 | 7968 -> 8074 (+106) GFLOPS
problemSize = 1522 | A B | 896 -> 832 | 8044 -> 8129 ( +85) GFLOPS
problemSize = 1523 | A B | 896 -> 832 | 8056 -> 8139 ( +83) GFLOPS
problemSize = 1524 | A B | 896 -> 832 | 8153 -> 8202 ( +49) GFLOPS
problemSize = 1525 | A B | 896 -> 832 | 8132 -> 8188 ( +56) GFLOPS
problemSize = 1526 | A B | 896 -> 832 | 8190 -> 8235 ( +45) GFLOPS
problemSize = 1527 | A B | 896 -> 832 | 8213 -> 8252 ( +39) GFLOPS
problemSize = 1528 | A B | 896 -> 832 | 8281 -> 8313 ( +32) GFLOPS
problemSize = 1529 | A B | 832 -> 768 | 8098 -> 8161 ( +63) GFLOPS
problemSize = 1530 | A B | 832 -> 768 | 8164 -> 8222 ( +58) GFLOPS
problemSize = 1531 | A B | 832 -> 768 | 8186 -> 8234 ( +48) GFLOPS
problemSize = 1532 | A B | 832 -> 768 | 8296 -> 8316 ( +20) GFLOPS
problemSize = 1533 | A B | 832 -> 768 | 8263 -> 8313 ( +50) GFLOPS
problemSize = 1534 | A B | 832 -> 768 | 8317 -> 8339 ( +22) GFLOPS
problemSize = 1535 | A B | 832 -> 768 | 8265 -> 8355 ( +90) GFLOPS
problemSize = 1536 | A B | 768 -> 832 | 8429 -> 8447 ( +18) GFLOPS
problemSize = 1488 | A B^T | 768 -> 832 | 8244 -> 8230 ( -14) GFLOPS
problemSize = 1489 | A B^T | 896 -> 832 | 7264 -> 7352 ( +88) GFLOPS
problemSize = 1490 | A B^T | 896 -> 832 | 7484 -> 7625 (+141) GFLOPS
problemSize = 1491 | A B^T | 896 -> 832 | 7392 -> 7473 ( +81) GFLOPS
problemSize = 1492 | A B^T | 896 -> 832 | 7801 -> 7931 (+130) GFLOPS
problemSize = 1493 | A B^T | 896 -> 832 | 7455 -> 7527 ( +72) GFLOPS
problemSize = 1494 | A B^T | 896 -> 832 | 7670 -> 7732 ( +62) GFLOPS
problemSize = 1495 | A B^T | 896 -> 832 | 7557 -> 7632 ( +75) GFLOPS
problemSize = 1496 | A B^T | 896 -> 896 | 8091 -> 8146 ( +55) GFLOPS
problemSize = 1497 | A B^T | 896 -> 832 | 7277 -> 7341 ( +64) GFLOPS
problemSize = 1498 | A B^T | 896 -> 832 | 7520 -> 7573 ( +53) GFLOPS
problemSize = 1499 | A B^T | 896 -> 832 | 7464 -> 7494 ( +30) GFLOPS
problemSize = 1500 | A B^T | 896 -> 832 | 7872 -> 7979 (+107) GFLOPS
problemSize = 1501 | A B^T | 896 -> 832 | 7599 -> 7651 ( +52) GFLOPS
problemSize = 1502 | A B^T | 896 -> 832 | 7852 -> 7884 ( +32) GFLOPS
problemSize = 1503 | A B^T | 896 -> 832 | 7713 -> 7736 ( +23) GFLOPS
problemSize = 1504 | A B^T | 896 -> 896 | 8107 -> 8134 ( +27) GFLOPS
problemSize = 1505 | A B^T | 832 -> 768 | 7465 -> 7492 ( +27) GFLOPS
problemSize = 1506 | A B^T | 832 -> 768 | 7686 -> 7698 ( +12) GFLOPS
problemSize = 1507 | A B^T | 832 -> 768 | 7589 -> 7628 ( +39) GFLOPS
problemSize = 1508 | A B^T | 832 -> 768 | 8051 -> 8124 ( +73) GFLOPS
problemSize = 1509 | A B^T | 832 -> 768 | 7717 -> 7719 ( +2) GFLOPS
problemSize = 1510 | A B^T | 832 -> 768 | 7901 -> 7918 ( +17) GFLOPS
problemSize = 1511 | A B^T | 832 -> 768 | 7771 -> 7777 ( +6) GFLOPS
problemSize = 1512 | A B^T | 768 -> 832 | 8354 -> 8338 ( -16) GFLOPS
problemSize = 1513 | A B^T | 896 -> 832 | 7479 -> 7528 ( +49) GFLOPS
problemSize = 1514 | A B^T | 896 -> 832 | 7695 -> 7796 (+101) GFLOPS
problemSize = 1515 | A B^T | 896 -> 832 | 7657 -> 7709 ( +52) GFLOPS
problemSize = 1516 | A B^T | 896 -> 832 | 7975 -> 8134 (+159) GFLOPS
problemSize = 1517 | A B^T | 896 -> 832 | 7771 -> 7807 ( +36) GFLOPS
problemSize = 1518 | A B^T | 896 -> 832 | 7978 -> 8030 ( +52) GFLOPS
problemSize = 1519 | A B^T | 896 -> 832 | 7893 -> 7908 ( +15) GFLOPS
problemSize = 1520 | A B^T | 896 -> 896 | 8242 -> 8313 ( +71) GFLOPS
problemSize = 1521 | A B^T | 896 -> 832 | 7593 -> 7635 ( +42) GFLOPS
problemSize = 1522 | A B^T | 896 -> 832 | 7785 -> 7853 ( +68) GFLOPS
problemSize = 1523 | A B^T | 896 -> 832 | 7653 -> 7706 ( +53) GFLOPS
problemSize = 1524 | A B^T | 896 -> 832 | 8002 -> 8105 (+103) GFLOPS
problemSize = 1525 | A B^T | 896 -> 832 | 7708 -> 7744 ( +36) GFLOPS
problemSize = 1526 | A B^T | 896 -> 832 | 7967 -> 8010 ( +43) GFLOPS
problemSize = 1527 | A B^T | 896 -> 832 | 7906 -> 7926 ( +20) GFLOPS
problemSize = 1528 | A B^T | 896 -> 896 | 8273 -> 8307 ( +34) GFLOPS
problemSize = 1529 | A B^T | 832 -> 768 | 7640 -> 7642 ( +2) GFLOPS
problemSize = 1530 | A B^T | 832 -> 768 | 7809 -> 7844 ( +35) GFLOPS
problemSize = 1531 | A B^T | 832 -> 768 | 7743 -> 7738 ( -5) GFLOPS
problemSize = 1532 | A B^T | 832 -> 768 | 8187 -> 8245 ( +58) GFLOPS
problemSize = 1533 | A B^T | 832 -> 768 | 7876 -> 7895 ( +19) GFLOPS
problemSize = 1534 | A B^T | 832 -> 768 | 8047 -> 8079 ( +32) GFLOPS
problemSize = 1535 | A B^T | 832 -> 768 | 7887 -> 7899 ( +12) GFLOPS
problemSize = 1536 | A B^T | 768 -> 832 | 8491 -> 8479 ( -12) GFLOPS
FP16 (48x48x24)
problemSize = 976 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8130 -> 8139 -> 8160 -> 8145 ( +9, +30, +15) GFLOPS
problemSize = 977 | A B | 1024 -> 1024 -> 1024 -> 1024 | 7949 -> 7956 -> 7988 -> 7985 ( +7, +39, +36) GFLOPS
problemSize = 978 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8019 -> 8000 -> 8034 -> 8023 ( -19, +15, +4) GFLOPS
problemSize = 979 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8007 -> 8016 -> 8049 -> 8038 ( +9, +42, +31) GFLOPS
problemSize = 980 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8070 -> 8077 -> 8087 -> 8098 ( +7, +17, +28) GFLOPS
problemSize = 981 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8085 -> 8090 -> 8121 -> 8127 ( +5, +36, +42) GFLOPS
problemSize = 982 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8111 -> 8113 -> 8142 -> 8156 ( +2, +31, +45) GFLOPS
problemSize = 983 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8140 -> 8159 -> 8173 -> 8159 ( +19, +33, +19) GFLOPS
problemSize = 984 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8234 -> 8187 -> 8189 -> 8198 ( -47, -45, -36) GFLOPS
problemSize = 985 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8064 -> 8071 -> 8108 -> 8143 ( +7, +44, +79) GFLOPS
problemSize = 986 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8151 -> 8131 -> 8143 -> 8154 ( -20, -8, +3) GFLOPS
problemSize = 987 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8147 -> 8165 -> 8186 -> 8175 ( +18, +39, +28) GFLOPS
problemSize = 988 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8211 -> 8214 -> 8228 -> 8250 ( +3, +17, +39) GFLOPS
problemSize = 989 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8215 -> 8223 -> 8257 -> 8247 ( +8, +42, +32) GFLOPS
problemSize = 990 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8297 -> 8294 -> 8290 -> 8289 ( -3, -7, -8) GFLOPS
problemSize = 991 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8269 -> 8316 -> 8277 -> 8288 ( +47, +8, +19) GFLOPS
problemSize = 992 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8384 -> 8359 -> 8347 -> 8349 ( -25, -37, -35) GFLOPS
problemSize = 993 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8269 -> 8245 -> 8275 -> 8309 ( -24, +6, +40) GFLOPS
problemSize = 994 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8279 -> 8281 -> 8309 -> 8328 ( +2, +30, +49) GFLOPS
problemSize = 995 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8304 -> 8320 -> 8381 -> 8350 ( +16, +77, +46) GFLOPS
problemSize = 996 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8353 -> 8373 -> 8397 -> 8387 ( +20, +44, +34) GFLOPS
problemSize = 997 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8372 -> 8372 -> 8409 -> 8408 ( 0, +37, +36) GFLOPS
problemSize = 998 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8412 -> 8421 -> 8437 -> 8456 ( +9, +25, +44) GFLOPS
problemSize = 999 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8428 -> 8437 -> 8465 -> 8457 ( +9, +37, +29) GFLOPS
problemSize = 1000 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8488 -> 8486 -> 8524 -> 8511 ( -2, +36, +23) GFLOPS
problemSize = 1001 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8320 -> 8332 -> 8327 -> 8341 ( +12, +7, +21) GFLOPS
problemSize = 1002 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8380 -> 8348 -> 8379 -> 8389 ( -32, -1, +9) GFLOPS
problemSize = 1003 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8379 -> 8375 -> 8406 -> 8398 ( -4, +27, +19) GFLOPS
problemSize = 1004 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8428 -> 8419 -> 8439 -> 8434 ( -9, +11, +6) GFLOPS
problemSize = 1005 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8436 -> 8446 -> 8481 -> 8455 ( +10, +45, +19) GFLOPS
problemSize = 1006 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8509 -> 8504 -> 8484 -> 8490 ( -5, -25, -19) GFLOPS
problemSize = 1007 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8532 -> 8529 -> 8551 -> 8518 ( -3, +19, -14) GFLOPS
problemSize = 1008 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8580 -> 8555 -> 8537 -> 8556 ( -25, -43, -24) GFLOPS
problemSize = 1009 | A B | 1024 -> 1024 -> 1024 -> 1024 | 7922 -> 7902 -> 7915 -> 7932 ( -20, -7, +10) GFLOPS
problemSize = 1010 | A B | 1024 -> 1024 -> 1024 -> 1024 | 7950 -> 7951 -> 7957 -> 7953 ( +1, +7, +3) GFLOPS
problemSize = 1011 | A B | 1024 -> 1024 -> 1024 -> 1024 | 7969 -> 7968 -> 7978 -> 7981 ( -1, +9, +12) GFLOPS
problemSize = 1012 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8037 -> 8004 -> 8006 -> 8008 ( -33, -31, -29) GFLOPS
problemSize = 1013 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8025 -> 8026 -> 8025 -> 8050 ( +1, 0, +25) GFLOPS
problemSize = 1014 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8064 -> 8052 -> 8069 -> 8068 ( -12, +5, +4) GFLOPS
problemSize = 1015 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8074 -> 8085 -> 8073 -> 8085 ( +11, -1, +11) GFLOPS
problemSize = 1016 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8116 -> 8117 -> 8110 -> 8117 ( +1, -6, +1) GFLOPS
problemSize = 1017 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8041 -> 8027 -> 8073 -> 8100 ( -14, +32, +59) GFLOPS
problemSize = 1018 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8059 -> 8078 -> 8136 -> 8104 ( +19, +77, +45) GFLOPS
problemSize = 1019 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8105 -> 8091 -> 8138 -> 8132 ( -14, +33, +27) GFLOPS
problemSize = 1020 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8167 -> 8148 -> 8184 -> 8173 ( -19, +17, +6) GFLOPS
problemSize = 1021 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8171 -> 8155 -> 8187 -> 8189 ( -16, +16, +18) GFLOPS
problemSize = 1022 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8188 -> 8188 -> 8228 -> 8221 ( 0, +40, +33) GFLOPS
problemSize = 1023 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8206 -> 8220 -> 8241 -> 8248 ( +14, +35, +42) GFLOPS
problemSize = 1024 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8266 -> 8260 -> 8300 -> 8280 ( -6, +34, +14) GFLOPS
problemSize = 976 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8091 -> 8083 -> 8086 -> 8094 ( -8, -5, +3) GFLOPS
problemSize = 977 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 7685 -> 7679 -> 7750 -> 7744 ( -6, +65, +59) GFLOPS
problemSize = 978 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 7796 -> 7793 -> 7861 -> 7874 ( -3, +65, +78) GFLOPS
problemSize = 979 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 7879 -> 7879 -> 7967 -> 7954 ( 0, +88, +75) GFLOPS
problemSize = 980 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 7999 -> 8030 -> 8063 -> 8033 ( +31, +64, +34) GFLOPS
problemSize = 981 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8047 -> 8068 -> 8077 -> 8077 ( +21, +30, +30) GFLOPS
problemSize = 982 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8130 -> 8107 -> 8131 -> 8129 ( -23, +1, -1) GFLOPS
problemSize = 983 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8128 -> 8125 -> 8164 -> 8170 ( -3, +36, +42) GFLOPS
problemSize = 984 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8229 -> 8215 -> 8251 -> 8250 ( -14, +22, +21) GFLOPS
problemSize = 985 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 7811 -> 7821 -> 7875 -> 7848 ( +10, +64, +37) GFLOPS
problemSize = 986 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 7908 -> 7929 -> 7967 -> 7956 ( +21, +59, +48) GFLOPS
problemSize = 987 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 7986 -> 8004 -> 8017 -> 8025 ( +18, +31, +39) GFLOPS
problemSize = 988 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8098 -> 8114 -> 8123 -> 8116 ( +16, +25, +18) GFLOPS
problemSize = 989 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8140 -> 8174 -> 8141 -> 8166 ( +34, +1, +26) GFLOPS
problemSize = 990 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8222 -> 8215 -> 8223 -> 8183 ( -7, +1, -39) GFLOPS
problemSize = 991 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8285 -> 8282 -> 8213 -> 8207 ( -3, -72, -78) GFLOPS
problemSize = 992 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8357 -> 8351 -> 8339 -> 8342 ( -6, -18, -15) GFLOPS
problemSize = 993 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 7906 -> 7922 -> 8006 -> 7982 ( +16, +100, +76) GFLOPS
problemSize = 994 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8015 -> 8023 -> 8092 -> 8117 ( +8, +77, +102) GFLOPS
problemSize = 995 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8068 -> 8093 -> 8144 -> 8142 ( +25, +76, +74) GFLOPS
problemSize = 996 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8201 -> 8216 -> 8247 -> 8237 ( +15, +46, +36) GFLOPS
problemSize = 997 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8281 -> 8260 -> 8276 -> 8281 ( -21, -5, 0) GFLOPS
problemSize = 998 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8341 -> 8328 -> 8327 -> 8324 ( -13, -14, -17) GFLOPS
problemSize = 999 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8349 -> 8365 -> 8364 -> 8366 ( +16, +15, +17) GFLOPS
problemSize = 1000 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8442 -> 8465 -> 8495 -> 8454 ( +23, +53, +12) GFLOPS
problemSize = 1001 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8070 -> 8081 -> 8115 -> 8115 ( +11, +45, +45) GFLOPS
problemSize = 1002 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8206 -> 8177 -> 8217 -> 8225 ( -29, +11, +19) GFLOPS
problemSize = 1003 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8232 -> 8245 -> 8284 -> 8292 ( +13, +52, +60) GFLOPS
problemSize = 1004 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8345 -> 8369 -> 8402 -> 8386 ( +24, +57, +41) GFLOPS
problemSize = 1005 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8376 -> 8388 -> 8429 -> 8430 ( +12, +53, +54) GFLOPS
problemSize = 1006 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8471 -> 8466 -> 8499 -> 8486 ( -5, +28, +15) GFLOPS
problemSize = 1007 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8476 -> 8501 -> 8518 -> 8514 ( +25, +42, +38) GFLOPS
problemSize = 1008 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8619 -> 8580 -> 8628 -> 8653 ( -39, +9, +34) GFLOPS
problemSize = 1009 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 7724 -> 7759 -> 7777 -> 7751 ( +35, +53, +27) GFLOPS
problemSize = 1010 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 7835 -> 7838 -> 7847 -> 7831 ( +3, +12, -4) GFLOPS
problemSize = 1011 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 7898 -> 7892 -> 7881 -> 7877 ( -6, -17, -21) GFLOPS
problemSize = 1012 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 7984 -> 8006 -> 7939 -> 7973 ( +22, -45, -11) GFLOPS
problemSize = 1013 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8037 -> 8014 -> 7972 -> 7971 ( -23, -65, -66) GFLOPS
problemSize = 1014 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8071 -> 8071 -> 8005 -> 8002 ( 0, -66, -69) GFLOPS
problemSize = 1015 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8090 -> 8094 -> 8035 -> 8036 ( +4, -55, -54) GFLOPS
problemSize = 1016 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8177 -> 8144 -> 8121 -> 8150 ( -33, -56, -27) GFLOPS
problemSize = 1017 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 7854 -> 7854 -> 7870 -> 7875 ( 0, +16, +21) GFLOPS
problemSize = 1018 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 7954 -> 7948 -> 7951 -> 7946 ( -6, -3, -8) GFLOPS
problemSize = 1019 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8027 -> 8021 -> 8019 -> 8010 ( -6, -8, -17) GFLOPS
problemSize = 1020 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8119 -> 8102 -> 8077 -> 8101 ( -17, -42, -18) GFLOPS
problemSize = 1021 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8134 -> 8154 -> 8126 -> 8119 ( +20, -8, -15) GFLOPS
problemSize = 1022 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8206 -> 8185 -> 8133 -> 8129 ( -21, -73, -77) GFLOPS
problemSize = 1023 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8218 -> 8214 -> 8189 -> 8201 ( -4, -29, -17) GFLOPS
problemSize = 1024 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8282 -> 8302 -> 8284 -> 8281 ( +20, +2, -1) GFLOPS
problemSize = 976 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8149 -> 8152 -> 8129 -> 8106 ( +3, -20, -43) GFLOPS
problemSize = 977 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8057 -> 8080 -> 8026 -> 8021 ( +23, -31, -36) GFLOPS
problemSize = 978 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8086 -> 8087 -> 8056 -> 8045 ( +1, -30, -41) GFLOPS
problemSize = 979 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8107 -> 8115 -> 8075 -> 8078 ( +8, -32, -29) GFLOPS
problemSize = 980 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8156 -> 8137 -> 8119 -> 8121 ( -19, -37, -35) GFLOPS
problemSize = 981 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8151 -> 8149 -> 8126 -> 8133 ( -2, -25, -18) GFLOPS
problemSize = 982 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8186 -> 8211 -> 8149 -> 8151 ( +25, -37, -35) GFLOPS
problemSize = 983 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8206 -> 8206 -> 8177 -> 8157 ( 0, -29, -49) GFLOPS
problemSize = 984 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8208 -> 8198 -> 8235 -> 8235 ( -10, +27, +27) GFLOPS
problemSize = 985 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8175 -> 8179 -> 8137 -> 8130 ( +4, -38, -45) GFLOPS
problemSize = 986 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8190 -> 8215 -> 8202 -> 8158 ( +25, +12, -32) GFLOPS
problemSize = 987 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8202 -> 8221 -> 8198 -> 8185 ( +19, -4, -17) GFLOPS
problemSize = 988 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8258 -> 8270 -> 8222 -> 8214 ( +12, -36, -44) GFLOPS
problemSize = 989 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8265 -> 8270 -> 8250 -> 8278 ( +5, -15, +13) GFLOPS
problemSize = 990 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8288 -> 8293 -> 8256 -> 8265 ( +5, -32, -23) GFLOPS
problemSize = 991 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8318 -> 8347 -> 8285 -> 8301 ( +29, -33, -17) GFLOPS
problemSize = 992 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8375 -> 8406 -> 8394 -> 8368 ( +31, +19, -7) GFLOPS
problemSize = 993 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8339 -> 8298 -> 8303 -> 8304 ( -41, -36, -35) GFLOPS
problemSize = 994 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8330 -> 8336 -> 8349 -> 8327 ( +6, +19, -3) GFLOPS
problemSize = 995 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8362 -> 8374 -> 8347 -> 8376 ( +12, -15, +14) GFLOPS
problemSize = 996 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8418 -> 8401 -> 8404 -> 8385 ( -17, -14, -33) GFLOPS
problemSize = 997 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8394 -> 8425 -> 8397 -> 8410 ( +31, +3, +16) GFLOPS
problemSize = 998 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8468 -> 8449 -> 8424 -> 8423 ( -19, -44, -45) GFLOPS
problemSize = 999 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8473 -> 8479 -> 8451 -> 8458 ( +6, -22, -15) GFLOPS
problemSize = 1000 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8502 -> 8511 -> 8479 -> 8478 ( +9, -23, -24) GFLOPS
problemSize = 1001 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8442 -> 8439 -> 8415 -> 8437 ( -3, -27, -5) GFLOPS
problemSize = 1002 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8459 -> 8494 -> 8416 -> 8423 ( +35, -43, -36) GFLOPS
problemSize = 1003 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8473 -> 8486 -> 8456 -> 8465 ( +13, -17, -8) GFLOPS
problemSize = 1004 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8511 -> 8527 -> 8478 -> 8473 ( +16, -33, -38) GFLOPS
problemSize = 1005 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8529 -> 8551 -> 8499 -> 8537 ( +22, -30, +8) GFLOPS
problemSize = 1006 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8595 -> 8581 -> 8529 -> 8525 ( -14, -66, -70) GFLOPS
problemSize = 1007 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8590 -> 8617 -> 8532 -> 8536 ( +27, -58, -54) GFLOPS
problemSize = 1008 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8585 -> 8598 -> 8615 -> 8619 ( +13, +30, +34) GFLOPS
problemSize = 1009 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 7974 -> 7993 -> 7963 -> 7961 ( +19, -11, -13) GFLOPS
problemSize = 1010 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8011 -> 8008 -> 7993 -> 7992 ( -3, -18, -19) GFLOPS
problemSize = 1011 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8038 -> 8030 -> 8029 -> 8018 ( -8, -9, -20) GFLOPS
problemSize = 1012 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8048 -> 8060 -> 8064 -> 8033 ( +12, +16, -15) GFLOPS
problemSize = 1013 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8075 -> 8073 -> 8065 -> 8069 ( -2, -10, -6) GFLOPS
problemSize = 1014 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8102 -> 8099 -> 8083 -> 8081 ( -3, -19, -21) GFLOPS
problemSize = 1015 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8125 -> 8157 -> 8110 -> 8105 ( +32, -15, -20) GFLOPS
problemSize = 1016 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8164 -> 8164 -> 8159 -> 8167 ( 0, -5, +3) GFLOPS
problemSize = 1017 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8116 -> 8140 -> 8135 -> 8119 ( +24, +19, +3) GFLOPS
problemSize = 1018 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8138 -> 8145 -> 8129 -> 8133 ( +7, -9, -5) GFLOPS
problemSize = 1019 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8176 -> 8168 -> 8191 -> 8159 ( -8, +15, -17) GFLOPS
problemSize = 1020 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8190 -> 8194 -> 8190 -> 8187 ( +4, 0, -3) GFLOPS
problemSize = 1021 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8220 -> 8210 -> 8219 -> 8216 ( -10, -1, -4) GFLOPS
problemSize = 1022 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8237 -> 8247 -> 8235 -> 8238 ( +10, -2, +1) GFLOPS
problemSize = 1023 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8253 -> 8267 -> 8283 -> 8251 ( +14, +30, -2) GFLOPS
problemSize = 1024 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8307 -> 8298 -> 8287 -> 8295 ( -9, -20, -12) GFLOPS
problemSize = 976 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8090 -> 8054 -> 8074 -> 8071 ( -36, -16, -19) GFLOPS
problemSize = 977 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 7936 -> 7939 -> 7998 -> 7977 ( +3, +62, +41) GFLOPS
problemSize = 978 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 7996 -> 7998 -> 8025 -> 8024 ( +2, +29, +28) GFLOPS
problemSize = 979 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8044 -> 8029 -> 8066 -> 8053 ( -15, +22, +9) GFLOPS
problemSize = 980 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8108 -> 8104 -> 8096 -> 8137 ( -4, -12, +29) GFLOPS
problemSize = 981 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8093 -> 8111 -> 8118 -> 8115 ( +18, +25, +22) GFLOPS
problemSize = 982 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8158 -> 8147 -> 8152 -> 8167 ( -11, -6, +9) GFLOPS
problemSize = 983 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8144 -> 8164 -> 8178 -> 8195 ( +20, +34, +51) GFLOPS
problemSize = 984 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8200 -> 8217 -> 8224 -> 8223 ( +17, +24, +23) GFLOPS
problemSize = 985 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8010 -> 7996 -> 8031 -> 8030 ( -14, +21, +20) GFLOPS
problemSize = 986 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8043 -> 8077 -> 8072 -> 8069 ( +34, +29, +26) GFLOPS
problemSize = 987 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8086 -> 8113 -> 8105 -> 8106 ( +27, +19, +20) GFLOPS
problemSize = 988 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8119 -> 8155 -> 8145 -> 8153 ( +36, +26, +34) GFLOPS
problemSize = 989 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8154 -> 8166 -> 8171 -> 8209 ( +12, +17, +55) GFLOPS
problemSize = 990 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8179 -> 8217 -> 8206 -> 8195 ( +38, +27, +16) GFLOPS
problemSize = 991 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8211 -> 8234 -> 8222 -> 8218 ( +23, +11, +7) GFLOPS
problemSize = 992 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8286 -> 8301 -> 8326 -> 8299 ( +15, +40, +13) GFLOPS
problemSize = 993 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8138 -> 8132 -> 8203 -> 8250 ( -6, +65, +112) GFLOPS
problemSize = 994 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8184 -> 8191 -> 8254 -> 8299 ( +7, +70, +115) GFLOPS
problemSize = 995 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8239 -> 8210 -> 8276 -> 8283 ( -29, +37, +44) GFLOPS
problemSize = 996 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8267 -> 8276 -> 8326 -> 8332 ( +9, +59, +65) GFLOPS
problemSize = 997 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8320 -> 8304 -> 8345 -> 8366 ( -16, +25, +46) GFLOPS
problemSize = 998 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8315 -> 8351 -> 8385 -> 8405 ( +36, +70, +90) GFLOPS
problemSize = 999 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8350 -> 8353 -> 8400 -> 8398 ( +3, +50, +48) GFLOPS
problemSize = 1000 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8408 -> 8411 -> 8442 -> 8442 ( +3, +34, +34) GFLOPS
problemSize = 1001 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8298 -> 8309 -> 8346 -> 8335 ( +11, +48, +37) GFLOPS
problemSize = 1002 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8351 -> 8370 -> 8416 -> 8395 ( +19, +65, +44) GFLOPS
problemSize = 1003 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8384 -> 8396 -> 8419 -> 8463 ( +12, +35, +79) GFLOPS
problemSize = 1004 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8432 -> 8444 -> 8439 -> 8469 ( +12, +7, +37) GFLOPS
problemSize = 1005 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8459 -> 8493 -> 8486 -> 8475 ( +34, +27, +16) GFLOPS
problemSize = 1006 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8480 -> 8504 -> 8541 -> 8519 ( +24, +61, +39) GFLOPS
problemSize = 1007 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8561 -> 8552 -> 8586 -> 8590 ( -9, +25, +29) GFLOPS
problemSize = 1008 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8587 -> 8608 -> 8610 -> 8592 ( +21, +23, +5) GFLOPS
problemSize = 1009 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 7855 -> 7865 -> 7879 -> 7871 ( +10, +24, +16) GFLOPS
problemSize = 1010 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 7924 -> 7903 -> 7910 -> 7911 ( -21, -14, -13) GFLOPS
problemSize = 1011 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 7953 -> 7947 -> 7939 -> 7950 ( -6, -14, -3) GFLOPS
problemSize = 1012 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 7968 -> 7971 -> 7979 -> 7976 ( +3, +11, +8) GFLOPS
problemSize = 1013 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8005 -> 7995 -> 7998 -> 7981 ( -10, -7, -24) GFLOPS
problemSize = 1014 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8041 -> 8027 -> 8013 -> 8032 ( -14, -28, -9) GFLOPS
problemSize = 1015 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8043 -> 8038 -> 8052 -> 8056 ( -5, +9, +13) GFLOPS
problemSize = 1016 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8112 -> 8100 -> 8069 -> 8084 ( -12, -43, -28) GFLOPS
problemSize = 1017 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8001 -> 8012 -> 8060 -> 8062 ( +11, +59, +61) GFLOPS
problemSize = 1018 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8039 -> 8039 -> 8092 -> 8092 ( 0, +53, +53) GFLOPS
problemSize = 1019 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8078 -> 8074 -> 8128 -> 8165 ( -4, +50, +87) GFLOPS
problemSize = 1020 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8111 -> 8112 -> 8154 -> 8160 ( +1, +43, +49) GFLOPS
problemSize = 1021 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8134 -> 8128 -> 8171 -> 8179 ( -6, +37, +45) GFLOPS
problemSize = 1022 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8148 -> 8158 -> 8225 -> 8227 ( +10, +77, +79) GFLOPS
problemSize = 1023 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8175 -> 8208 -> 8235 -> 8233 ( +33, +60, +58) GFLOPS
problemSize = 1024 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8249 -> 8303 -> 8289 -> 8266 ( +54, +40, +17) GFLOPS
FP16 (48x48x32)
problemSize = 976 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8366 -> 8359 -> 8365 -> 8377 ( -7, -1, +11) GFLOPS
problemSize = 977 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8126 -> 8131 -> 8192 -> 8217 ( +5, +66, +91) GFLOPS
problemSize = 978 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8171 -> 8162 -> 8237 -> 8214 ( -9, +66, +43) GFLOPS
problemSize = 979 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8194 -> 8206 -> 8265 -> 8251 ( +12, +71, +57) GFLOPS
problemSize = 980 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8255 -> 8241 -> 8290 -> 8287 ( -14, +35, +32) GFLOPS
problemSize = 981 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8273 -> 8299 -> 8312 -> 8314 ( +26, +39, +41) GFLOPS
problemSize = 982 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8306 -> 8312 -> 8341 -> 8355 ( +6, +35, +49) GFLOPS
problemSize = 983 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8323 -> 8315 -> 8370 -> 8368 ( -8, +47, +45) GFLOPS
problemSize = 984 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8402 -> 8410 -> 8492 -> 8461 ( +8, +90, +59) GFLOPS
problemSize = 985 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8233 -> 8228 -> 8257 -> 8229 ( -5, +24, -4) GFLOPS
problemSize = 986 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8258 -> 8264 -> 8292 -> 8265 ( +6, +34, +7) GFLOPS
problemSize = 987 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8317 -> 8292 -> 8298 -> 8302 ( -25, -19, -15) GFLOPS
problemSize = 988 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8349 -> 8337 -> 8348 -> 8379 ( -12, -1, +30) GFLOPS
problemSize = 989 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8351 -> 8391 -> 8360 -> 8357 ( +40, +9, +6) GFLOPS
problemSize = 990 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8413 -> 8388 -> 8396 -> 8395 ( -25, -17, -18) GFLOPS
problemSize = 991 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8425 -> 8416 -> 8419 -> 8440 ( -9, -6, +15) GFLOPS
problemSize = 992 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8499 -> 8499 -> 8507 -> 8515 ( 0, +8, +16) GFLOPS
problemSize = 993 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8368 -> 8389 -> 8394 -> 8398 ( +21, +26, +30) GFLOPS
problemSize = 994 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8390 -> 8404 -> 8430 -> 8423 ( +14, +40, +33) GFLOPS
problemSize = 995 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8422 -> 8415 -> 8456 -> 8449 ( -7, +34, +27) GFLOPS
problemSize = 996 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8456 -> 8476 -> 8526 -> 8484 ( +20, +70, +28) GFLOPS
problemSize = 997 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8477 -> 8521 -> 8547 -> 8509 ( +44, +70, +32) GFLOPS
problemSize = 998 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8516 -> 8561 -> 8567 -> 8554 ( +45, +51, +38) GFLOPS
problemSize = 999 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8544 -> 8557 -> 8585 -> 8606 ( +13, +41, +62) GFLOPS
problemSize = 1000 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8605 -> 8594 -> 8629 -> 8667 ( -11, +24, +62) GFLOPS
problemSize = 1001 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8597 -> 8598 -> 8634 -> 8646 ( +1, +37, +49) GFLOPS
problemSize = 1002 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8637 -> 8677 -> 8678 -> 8680 ( +40, +41, +43) GFLOPS
problemSize = 1003 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8714 -> 8669 -> 8709 -> 8733 ( -45, -5, +19) GFLOPS
problemSize = 1004 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8731 -> 8764 -> 8745 -> 8746 ( +33, +14, +15) GFLOPS
problemSize = 1005 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8736 -> 8731 -> 8772 -> 8768 ( -5, +36, +32) GFLOPS
problemSize = 1006 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8771 -> 8816 -> 8809 -> 8790 ( +45, +38, +19) GFLOPS
problemSize = 1007 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8799 -> 8802 -> 8828 -> 8823 ( +3, +29, +24) GFLOPS
problemSize = 1008 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8881 -> 8893 -> 8998 -> 8987 ( +12, +117, +106) GFLOPS
problemSize = 1009 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8030 -> 8028 -> 8083 -> 8076 ( -2, +53, +46) GFLOPS
problemSize = 1010 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8073 -> 8068 -> 8132 -> 8118 ( -5, +59, +45) GFLOPS
problemSize = 1011 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8134 -> 8139 -> 8177 -> 8143 ( +5, +43, +9) GFLOPS
problemSize = 1012 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8170 -> 8167 -> 8206 -> 8179 ( -3, +36, +9) GFLOPS
problemSize = 1013 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8182 -> 8142 -> 8193 -> 8201 ( -40, +11, +19) GFLOPS
problemSize = 1014 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8191 -> 8173 -> 8244 -> 8239 ( -18, +53, +48) GFLOPS
problemSize = 1015 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8209 -> 8201 -> 8258 -> 8251 ( -8, +49, +42) GFLOPS
problemSize = 1016 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8272 -> 8253 -> 8286 -> 8293 ( -19, +14, +21) GFLOPS
problemSize = 1017 | A B | 1024 -> 1024 -> 1024 -> 1024 | 7876 -> 7912 -> 7909 -> 7904 ( +36, +33, +28) GFLOPS
problemSize = 1018 | A B | 1024 -> 1024 -> 1024 -> 1024 | 7917 -> 7932 -> 7937 -> 7908 ( +15, +20, -9) GFLOPS
problemSize = 1019 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8028 -> 7913 -> 7897 -> 7966 (-115, -131, -62) GFLOPS
problemSize = 1020 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8004 -> 8017 -> 7964 -> 8016 ( +13, -40, +12) GFLOPS
problemSize = 1021 | A B | 1024 -> 1024 -> 1024 -> 1024 | 7985 -> 8056 -> 8072 -> 7997 ( +71, +87, +12) GFLOPS
problemSize = 1022 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8081 -> 8030 -> 7988 -> 8021 ( -51, -93, -60) GFLOPS
problemSize = 1023 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8113 -> 8059 -> 8122 -> 8033 ( -54, +9, -80) GFLOPS
problemSize = 1024 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8121 -> 8141 -> 8134 -> 8178 ( +20, +13, +57) GFLOPS
problemSize = 976 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8277 -> 8264 -> 8341 -> 8351 ( -13, +64, +74) GFLOPS
problemSize = 977 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 7999 -> 8006 -> 8090 -> 8107 ( +7, +91, +108) GFLOPS
problemSize = 978 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8097 -> 8103 -> 8190 -> 8233 ( +6, +93, +136) GFLOPS
problemSize = 979 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8181 -> 8179 -> 8285 -> 8262 ( -2, +104, +81) GFLOPS
problemSize = 980 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8259 -> 8267 -> 8334 -> 8329 ( +8, +75, +70) GFLOPS
problemSize = 981 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8293 -> 8300 -> 8377 -> 8364 ( +7, +84, +71) GFLOPS
problemSize = 982 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8379 -> 8347 -> 8458 -> 8414 ( -32, +79, +35) GFLOPS
problemSize = 983 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8380 -> 8378 -> 8441 -> 8442 ( -2, +61, +62) GFLOPS
problemSize = 984 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8447 -> 8455 -> 8491 -> 8477 ( +8, +44, +30) GFLOPS
problemSize = 985 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8054 -> 8057 -> 8094 -> 8105 ( +3, +40, +51) GFLOPS
problemSize = 986 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8184 -> 8213 -> 8207 -> 8221 ( +29, +23, +37) GFLOPS
problemSize = 987 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8281 -> 8294 -> 8305 -> 8344 ( +13, +24, +63) GFLOPS
problemSize = 988 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8367 -> 8357 -> 8414 -> 8406 ( -10, +47, +39) GFLOPS
problemSize = 989 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8400 -> 8415 -> 8484 -> 8463 ( +15, +84, +63) GFLOPS
problemSize = 990 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8482 -> 8527 -> 8523 -> 8519 ( +45, +41, +37) GFLOPS
problemSize = 991 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8505 -> 8517 -> 8589 -> 8589 ( +12, +84, +84) GFLOPS
problemSize = 992 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8693 -> 8672 -> 8694 -> 8698 ( -21, +1, +5) GFLOPS
problemSize = 993 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8245 -> 8214 -> 8363 -> 8358 ( -31, +118, +113) GFLOPS
problemSize = 994 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8314 -> 8309 -> 8434 -> 8445 ( -5, +120, +131) GFLOPS
problemSize = 995 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8353 -> 8373 -> 8483 -> 8501 ( +20, +130, +148) GFLOPS
problemSize = 996 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8445 -> 8438 -> 8551 -> 8553 ( -7, +106, +108) GFLOPS
problemSize = 997 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8474 -> 8480 -> 8595 -> 8587 ( +6, +121, +113) GFLOPS
problemSize = 998 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8509 -> 8518 -> 8648 -> 8626 ( +9, +139, +117) GFLOPS
problemSize = 999 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8547 -> 8546 -> 8700 -> 8667 ( -1, +153, +120) GFLOPS
problemSize = 1000 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8651 -> 8654 -> 8701 -> 8693 ( +3, +50, +42) GFLOPS
problemSize = 1001 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8344 -> 8366 -> 8426 -> 8419 ( +22, +82, +75) GFLOPS
problemSize = 1002 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8438 -> 8425 -> 8523 -> 8513 ( -13, +85, +75) GFLOPS
problemSize = 1003 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8493 -> 8495 -> 8578 -> 8600 ( +2, +85, +107) GFLOPS
problemSize = 1004 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8567 -> 8576 -> 8663 -> 8657 ( +9, +96, +90) GFLOPS
problemSize = 1005 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8646 -> 8610 -> 8700 -> 8687 ( -36, +54, +41) GFLOPS
problemSize = 1006 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8657 -> 8668 -> 8737 -> 8735 ( +11, +80, +78) GFLOPS
problemSize = 1007 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8690 -> 8693 -> 8775 -> 8783 ( +3, +85, +93) GFLOPS
problemSize = 1008 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8778 -> 8778 -> 8915 -> 8893 ( 0, +137, +115) GFLOPS
problemSize = 1009 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 7942 -> 7940 -> 8055 -> 8072 ( -2, +113, +130) GFLOPS
problemSize = 1010 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8026 -> 8027 -> 8108 -> 8112 ( +1, +82, +86) GFLOPS
problemSize = 1011 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8083 -> 8087 -> 8159 -> 8169 ( +4, +76, +86) GFLOPS
problemSize = 1012 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8146 -> 8169 -> 8226 -> 8213 ( +23, +80, +67) GFLOPS
problemSize = 1013 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8222 -> 8199 -> 8240 -> 8240 ( -23, +18, +18) GFLOPS
problemSize = 1014 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8237 -> 8236 -> 8270 -> 8277 ( -1, +33, +40) GFLOPS
problemSize = 1015 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8285 -> 8257 -> 8314 -> 8323 ( -28, +29, +38) GFLOPS
problemSize = 1016 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8305 -> 8311 -> 8352 -> 8353 ( +6, +47, +48) GFLOPS
problemSize = 1017 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8151 -> 8157 -> 8233 -> 8259 ( +6, +82, +108) GFLOPS
problemSize = 1018 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8245 -> 8233 -> 8306 -> 8322 ( -12, +61, +77) GFLOPS
problemSize = 1019 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8307 -> 8316 -> 8391 -> 8367 ( +9, +84, +60) GFLOPS
problemSize = 1020 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8384 -> 8408 -> 8409 -> 8416 ( +24, +25, +32) GFLOPS
problemSize = 1021 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8452 -> 8467 -> 8463 -> 8454 ( +15, +11, +2) GFLOPS
problemSize = 1022 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8458 -> 8463 -> 8474 -> 8482 ( +5, +16, +24) GFLOPS
problemSize = 1023 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8499 -> 8487 -> 8512 -> 8522 ( -12, +13, +23) GFLOPS
problemSize = 1024 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8555 -> 8553 -> 8612 -> 8581 ( -2, +57, +26) GFLOPS
problemSize = 976 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8391 -> 8397 -> 8405 -> 8398 ( +6, +14, +7) GFLOPS
problemSize = 977 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8313 -> 8314 -> 8370 -> 8380 ( +1, +57, +67) GFLOPS
problemSize = 978 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8330 -> 8345 -> 8407 -> 8391 ( +15, +77, +61) GFLOPS
problemSize = 979 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8369 -> 8366 -> 8411 -> 8450 ( -3, +42, +81) GFLOPS
problemSize = 980 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8399 -> 8400 -> 8464 -> 8462 ( +1, +65, +63) GFLOPS
problemSize = 981 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8407 -> 8438 -> 8456 -> 8486 ( +31, +49, +79) GFLOPS
problemSize = 982 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8437 -> 8455 -> 8487 -> 8515 ( +18, +50, +78) GFLOPS
problemSize = 983 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8492 -> 8469 -> 8523 -> 8526 ( -23, +31, +34) GFLOPS
problemSize = 984 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8540 -> 8522 -> 8598 -> 8541 ( -18, +58, +1) GFLOPS
problemSize = 985 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8431 -> 8424 -> 8421 -> 8458 ( -7, -10, +27) GFLOPS
problemSize = 986 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8454 -> 8445 -> 8460 -> 8443 ( -9, +6, -11) GFLOPS
problemSize = 987 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8479 -> 8470 -> 8487 -> 8475 ( -9, +8, -4) GFLOPS
problemSize = 988 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8502 -> 8510 -> 8522 -> 8520 ( +8, +20, +18) GFLOPS
problemSize = 989 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8525 -> 8541 -> 8525 -> 8525 ( +16, 0, 0) GFLOPS
problemSize = 990 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8541 -> 8553 -> 8547 -> 8552 ( +12, +6, +11) GFLOPS
problemSize = 991 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8572 -> 8593 -> 8584 -> 8588 ( +21, +12, +16) GFLOPS
problemSize = 992 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8637 -> 8653 -> 8664 -> 8630 ( +16, +27, -7) GFLOPS
problemSize = 993 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8557 -> 8566 -> 8586 -> 8582 ( +9, +29, +25) GFLOPS
problemSize = 994 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8592 -> 8601 -> 8611 -> 8607 ( +9, +19, +15) GFLOPS
problemSize = 995 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8614 -> 8611 -> 8616 -> 8637 ( -3, +2, +23) GFLOPS
problemSize = 996 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8683 -> 8658 -> 8652 -> 8694 ( -25, -31, +11) GFLOPS
problemSize = 997 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8679 -> 8679 -> 8682 -> 8679 ( 0, +3, 0) GFLOPS
problemSize = 998 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8695 -> 8733 -> 8715 -> 8717 ( +38, +20, +22) GFLOPS
problemSize = 999 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8723 -> 8728 -> 8726 -> 8721 ( +5, +3, -2) GFLOPS
problemSize = 1000 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8761 -> 8782 -> 8799 -> 8785 ( +21, +38, +24) GFLOPS
problemSize = 1001 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8705 -> 8708 -> 8712 -> 8702 ( +3, +7, -3) GFLOPS
problemSize = 1002 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8726 -> 8728 -> 8736 -> 8725 ( +2, +10, -1) GFLOPS
problemSize = 1003 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8743 -> 8763 -> 8757 -> 8752 ( +20, +14, +9) GFLOPS
problemSize = 1004 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8774 -> 8784 -> 8793 -> 8801 ( +10, +19, +27) GFLOPS
problemSize = 1005 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8820 -> 8810 -> 8822 -> 8806 ( -10, +2, -14) GFLOPS
problemSize = 1006 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8817 -> 8840 -> 8841 -> 8839 ( +23, +24, +22) GFLOPS
problemSize = 1007 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8859 -> 8847 -> 8859 -> 8849 ( -12, 0, -10) GFLOPS
problemSize = 1008 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8923 -> 8919 -> 8969 -> 8958 ( -4, +46, +35) GFLOPS
problemSize = 1009 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8207 -> 8210 -> 8212 -> 8198 ( +3, +5, -9) GFLOPS
problemSize = 1010 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8237 -> 8238 -> 8231 -> 8258 ( +1, -6, +21) GFLOPS
problemSize = 1011 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8248 -> 8266 -> 8268 -> 8267 ( +18, +20, +19) GFLOPS
problemSize = 1012 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8287 -> 8311 -> 8288 -> 8279 ( +24, +1, -8) GFLOPS
problemSize = 1013 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8333 -> 8295 -> 8295 -> 8295 ( -38, -38, -38) GFLOPS
problemSize = 1014 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8325 -> 8342 -> 8314 -> 8328 ( +17, -11, +3) GFLOPS
problemSize = 1015 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8380 -> 8360 -> 8357 -> 8352 ( -20, -23, -28) GFLOPS
problemSize = 1016 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8389 -> 8414 -> 8371 -> 8371 ( +25, -18, -18) GFLOPS
problemSize = 1017 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8349 -> 8354 -> 8349 -> 8351 ( +5, 0, +2) GFLOPS
problemSize = 1018 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8384 -> 8402 -> 8367 -> 8387 ( +18, -17, +3) GFLOPS
problemSize = 1019 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8414 -> 8410 -> 8410 -> 8433 ( -4, -4, +19) GFLOPS
problemSize = 1020 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8424 -> 8439 -> 8427 -> 8446 ( +15, +3, +22) GFLOPS
problemSize = 1021 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8448 -> 8446 -> 8457 -> 8445 ( -2, +9, -3) GFLOPS
problemSize = 1022 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8499 -> 8484 -> 8471 -> 8485 ( -15, -28, -14) GFLOPS
problemSize = 1023 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8495 -> 8500 -> 8502 -> 8505 ( +5, +7, +10) GFLOPS
problemSize = 1024 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8538 -> 8549 -> 8532 -> 8519 ( +11, -6, -19) GFLOPS
problemSize = 976 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8375 -> 8408 -> 8358 -> 8374 ( +33, -17, -1) GFLOPS
problemSize = 977 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8187 -> 8209 -> 8217 -> 8209 ( +22, +30, +22) GFLOPS
problemSize = 978 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8260 -> 8248 -> 8270 -> 8266 ( -12, +10, +6) GFLOPS
problemSize = 979 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8293 -> 8286 -> 8289 -> 8303 ( -7, -4, +10) GFLOPS
problemSize = 980 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8317 -> 8330 -> 8330 -> 8340 ( +13, +13, +23) GFLOPS
problemSize = 981 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8350 -> 8365 -> 8372 -> 8357 ( +15, +22, +7) GFLOPS
problemSize = 982 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8404 -> 8396 -> 8411 -> 8392 ( -8, +7, -12) GFLOPS
problemSize = 983 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8446 -> 8422 -> 8411 -> 8410 ( -24, -35, -36) GFLOPS
problemSize = 984 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8488 -> 8494 -> 8460 -> 8473 ( +6, -28, -15) GFLOPS
problemSize = 985 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8228 -> 8232 -> 8226 -> 8244 ( +4, -2, +16) GFLOPS
problemSize = 986 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8277 -> 8283 -> 8266 -> 8261 ( +6, -11, -16) GFLOPS
problemSize = 987 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8307 -> 8326 -> 8294 -> 8289 ( +19, -13, -18) GFLOPS
problemSize = 988 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8363 -> 8368 -> 8334 -> 8351 ( +5, -29, -12) GFLOPS
problemSize = 989 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8407 -> 8404 -> 8387 -> 8363 ( -3, -20, -44) GFLOPS
problemSize = 990 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8418 -> 8430 -> 8392 -> 8398 ( +12, -26, -20) GFLOPS
problemSize = 991 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8465 -> 8462 -> 8427 -> 8456 ( -3, -38, -9) GFLOPS
problemSize = 992 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8562 -> 8564 -> 8541 -> 8542 ( +2, -21, -20) GFLOPS
problemSize = 993 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8425 -> 8458 -> 8455 -> 8493 ( +33, +30, +68) GFLOPS
problemSize = 994 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8483 -> 8494 -> 8499 -> 8535 ( +11, +16, +52) GFLOPS
problemSize = 995 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8516 -> 8531 -> 8540 -> 8559 ( +15, +24, +43) GFLOPS
problemSize = 996 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8580 -> 8591 -> 8586 -> 8620 ( +11, +6, +40) GFLOPS
problemSize = 997 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8621 -> 8625 -> 8612 -> 8606 ( +4, -9, -15) GFLOPS
problemSize = 998 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8658 -> 8668 -> 8635 -> 8672 ( +10, -23, +14) GFLOPS
problemSize = 999 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8706 -> 8691 -> 8662 -> 8668 ( -15, -44, -38) GFLOPS
problemSize = 1000 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8743 -> 8752 -> 8719 -> 8744 ( +9, -24, +1) GFLOPS
problemSize = 1001 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8579 -> 8615 -> 8569 -> 8575 ( +36, -10, -4) GFLOPS
problemSize = 1002 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8650 -> 8651 -> 8628 -> 8634 ( +1, -22, -16) GFLOPS
problemSize = 1003 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8690 -> 8671 -> 8648 -> 8657 ( -19, -42, -33) GFLOPS
problemSize = 1004 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8749 -> 8760 -> 8703 -> 8696 ( +11, -46, -53) GFLOPS
problemSize = 1005 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8753 -> 8782 -> 8723 -> 8740 ( +29, -30, -13) GFLOPS
problemSize = 1006 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8786 -> 8795 -> 8757 -> 8758 ( +9, -29, -28) GFLOPS
problemSize = 1007 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8829 -> 8854 -> 8784 -> 8788 ( +25, -45, -41) GFLOPS
problemSize = 1008 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8921 -> 8931 -> 8898 -> 8884 ( +10, -23, -37) GFLOPS
problemSize = 1009 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8142 -> 8144 -> 8149 -> 8167 ( +2, +7, +25) GFLOPS
problemSize = 1010 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8193 -> 8194 -> 8194 -> 8187 ( +1, +1, -6) GFLOPS
problemSize = 1011 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8219 -> 8223 -> 8217 -> 8219 ( +4, -2, 0) GFLOPS
problemSize = 1012 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8254 -> 8285 -> 8282 -> 8256 ( +31, +28, +2) GFLOPS
problemSize = 1013 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8307 -> 8276 -> 8268 -> 8264 ( -31, -39, -43) GFLOPS
problemSize = 1014 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8313 -> 8329 -> 8307 -> 8342 ( +16, -6, +29) GFLOPS
problemSize = 1015 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8339 -> 8336 -> 8333 -> 8334 ( -3, -6, -5) GFLOPS
problemSize = 1016 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8373 -> 8389 -> 8375 -> 8384 ( +16, +2, +11) GFLOPS
problemSize = 1017 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8222 -> 8232 -> 8203 -> 8198 ( +10, -19, -24) GFLOPS
problemSize = 1018 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8250 -> 8259 -> 8258 -> 8232 ( +9, +8, -18) GFLOPS
problemSize = 1019 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8301 -> 8286 -> 8271 -> 8261 ( -15, -30, -40) GFLOPS
problemSize = 1020 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8322 -> 8338 -> 8330 -> 8324 ( +16, +8, +2) GFLOPS
problemSize = 1021 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8357 -> 8350 -> 8329 -> 8336 ( -7, -28, -21) GFLOPS
problemSize = 1022 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8374 -> 8409 -> 8353 -> 8359 ( +35, -21, -15) GFLOPS
problemSize = 1023 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8418 -> 8441 -> 8387 -> 8413 ( +23, -31, -5) GFLOPS
problemSize = 1024 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8489 -> 8497 -> 8480 -> 8487 ( +8, -9, -2) GFLOPS
FP16 (48x48x40)
problemSize = 976 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8417 -> 8404 -> 8444 -> 8453 ( -13, +27, +36) GFLOPS
problemSize = 977 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8421 -> 8443 -> 8350 -> 8368 ( +22, -71, -53) GFLOPS
problemSize = 978 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8507 -> 8493 -> 8401 -> 8393 ( -14, -106, -114) GFLOPS
problemSize = 979 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8493 -> 8500 -> 8427 -> 8420 ( +7, -66, -73) GFLOPS
problemSize = 980 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8532 -> 8551 -> 8480 -> 8503 ( +19, -52, -29) GFLOPS
problemSize = 981 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8554 -> 8580 -> 8487 -> 8494 ( +26, -67, -60) GFLOPS
problemSize = 982 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8587 -> 8635 -> 8515 -> 8521 ( +48, -72, -66) GFLOPS
problemSize = 983 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8621 -> 8632 -> 8584 -> 8552 ( +11, -37, -69) GFLOPS
problemSize = 984 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8681 -> 8646 -> 8738 -> 8724 ( -35, +57, +43) GFLOPS
problemSize = 985 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8444 -> 8433 -> 8464 -> 8438 ( -11, +20, -6) GFLOPS
problemSize = 986 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8481 -> 8500 -> 8445 -> 8477 ( +19, -36, -4) GFLOPS
problemSize = 987 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8501 -> 8509 -> 8483 -> 8486 ( +8, -18, -15) GFLOPS
problemSize = 988 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8541 -> 8570 -> 8541 -> 8504 ( +29, 0, -37) GFLOPS
problemSize = 989 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8574 -> 8572 -> 8515 -> 8541 ( -2, -59, -33) GFLOPS
problemSize = 990 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8631 -> 8617 -> 8583 -> 8575 ( -14, -48, -56) GFLOPS
problemSize = 991 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8642 -> 8650 -> 8602 -> 8584 ( +8, -40, -58) GFLOPS
problemSize = 992 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8694 -> 8698 -> 8709 -> 8703 ( +4, +15, +9) GFLOPS
// Before forcing to 1024 threads/group
problemSize = 993 | A B | 896 -> 896 -> 832 -> 832 | 8400 -> 8391 -> 7941 -> 7910 ( -9, -459, -490) GFLOPS
problemSize = 994 | A B | 896 -> 896 -> 832 -> 832 | 8439 -> 8454 -> 7943 -> 7963 ( +15, -496, -476) GFLOPS
problemSize = 995 | A B | 896 -> 896 -> 832 -> 832 | 8447 -> 8455 -> 7987 -> 7973 ( +8, -460, -474) GFLOPS
problemSize = 996 | A B | 896 -> 896 -> 832 -> 832 | 8476 -> 8500 -> 8063 -> 8024 ( +24, -413, -452) GFLOPS
problemSize = 997 | A B | 896 -> 896 -> 832 -> 832 | 8509 -> 8543 -> 8071 -> 8037 ( +34, -438, -472) GFLOPS
problemSize = 998 | A B | 896 -> 896 -> 832 -> 832 | 8593 -> 8554 -> 8094 -> 8059 ( -39, -499, -534) GFLOPS
problemSize = 999 | A B | 896 -> 896 -> 832 -> 832 | 8566 -> 8644 -> 8094 -> 8114 ( +78, -472, -452) GFLOPS
problemSize = 1000 | A B | 896 -> 896 -> 832 -> 832 | 8638 -> 8707 -> 8124 -> 8179 ( +69, -514, -459) GFLOPS
// After forcing to 1024 threads/group
problemSize = 993 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8074 -> 8097 -> 8095 -> 8104 ( +23, +21, +30) GFLOPS
problemSize = 994 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8142 -> 8139 -> 8155 -> 8136 ( -3, +13, -6) GFLOPS
problemSize = 995 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8210 -> 8141 -> 8133 -> 8184 ( -69, -77, -26) GFLOPS
problemSize = 996 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8247 -> 8214 -> 8208 -> 8208 ( -33, -39, -39) GFLOPS
problemSize = 997 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8268 -> 8261 -> 8254 -> 8283 ( -7, -14, +15) GFLOPS
problemSize = 998 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8257 -> 8278 -> 8300 -> 8274 ( +21, +43, +17) GFLOPS
problemSize = 999 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8315 -> 8316 -> 8315 -> 8278 ( +1, 0, -37) GFLOPS
problemSize = 1000 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8423 -> 8310 -> 8317 -> 8294 (-113, -106, -129) GFLOPS
problemSize = 1001 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8651 -> 8637 -> 8668 -> 8662 ( -14, +17, +11) GFLOPS
problemSize = 1002 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8662 -> 8672 -> 8701 -> 8753 ( +10, +39, +91) GFLOPS
problemSize = 1003 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8688 -> 8695 -> 8709 -> 8722 ( +7, +21, +34) GFLOPS
problemSize = 1004 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8692 -> 8706 -> 8827 -> 8741 ( +14, +135, +49) GFLOPS
problemSize = 1005 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8747 -> 8827 -> 8767 -> 8762 ( +80, +20, +15) GFLOPS
problemSize = 1006 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8785 -> 8778 -> 8812 -> 8855 ( -7, +27, +70) GFLOPS
problemSize = 1007 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8785 -> 8837 -> 8819 -> 8848 ( +52, +34, +63) GFLOPS
problemSize = 1008 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8883 -> 8899 -> 9093 -> 9036 ( +16, +210, +153) GFLOPS
problemSize = 1009 | A B | 1024 -> 1024 -> 1024 -> 1024 | 7967 -> 7983 -> 8013 -> 8002 ( +16, +46, +35) GFLOPS
problemSize = 1010 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8019 -> 7992 -> 8038 -> 8043 ( -27, +19, +24) GFLOPS
problemSize = 1011 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8061 -> 8047 -> 8050 -> 8070 ( -14, -11, +9) GFLOPS
problemSize = 1012 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8077 -> 8059 -> 8078 -> 8075 ( -18, +1, -2) GFLOPS
problemSize = 1013 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8081 -> 8084 -> 8109 -> 8111 ( +3, +28, +30) GFLOPS
problemSize = 1014 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8119 -> 8132 -> 8152 -> 8136 ( +13, +33, +17) GFLOPS
problemSize = 1015 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8138 -> 8128 -> 8153 -> 8164 ( -10, +15, +26) GFLOPS
problemSize = 1016 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8185 -> 8208 -> 8185 -> 8202 ( +23, 0, +17) GFLOPS
problemSize = 1017 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8174 -> 8175 -> 8165 -> 8164 ( +1, -9, -10) GFLOPS
problemSize = 1018 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8244 -> 8200 -> 8162 -> 8205 ( -44, -82, -39) GFLOPS
problemSize = 1019 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8226 -> 8250 -> 8202 -> 8221 ( +24, -24, -5) GFLOPS
problemSize = 1020 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8269 -> 8281 -> 8262 -> 8227 ( +12, -7, -42) GFLOPS
problemSize = 1021 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8290 -> 8283 -> 8276 -> 8253 ( -7, -14, -37) GFLOPS
problemSize = 1022 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8305 -> 8327 -> 8312 -> 8320 ( +22, +7, +15) GFLOPS
problemSize = 1023 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8357 -> 8360 -> 8314 -> 8296 ( +3, -43, -61) GFLOPS
problemSize = 1024 | A B | 1024 -> 1024 -> 1024 -> 1024 | 8386 -> 8365 -> 8342 -> 8331 ( -21, -44, -55) GFLOPS
problemSize = 976 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8426 -> 8445 -> 8603 -> 8559 ( +19, +177, +133) GFLOPS
problemSize = 977 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8206 -> 8222 -> 8309 -> 8321 ( +16, +103, +115) GFLOPS
problemSize = 978 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8272 -> 8280 -> 8404 -> 8413 ( +8, +132, +141) GFLOPS
problemSize = 979 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8360 -> 8348 -> 8495 -> 8486 ( -12, +135, +126) GFLOPS
problemSize = 980 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8432 -> 8401 -> 8489 -> 8500 ( -31, +57, +68) GFLOPS
problemSize = 981 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8454 -> 8432 -> 8528 -> 8524 ( -22, +74, +70) GFLOPS
problemSize = 982 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8472 -> 8513 -> 8608 -> 8553 ( +41, +136, +81) GFLOPS
problemSize = 983 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8528 -> 8547 -> 8601 -> 8625 ( +19, +73, +97) GFLOPS
problemSize = 984 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8536 -> 8577 -> 8599 -> 8631 ( +41, +63, +95) GFLOPS
problemSize = 985 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8259 -> 8263 -> 8363 -> 8326 ( +4, +104, +67) GFLOPS
problemSize = 986 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8345 -> 8349 -> 8395 -> 8425 ( +4, +50, +80) GFLOPS
problemSize = 987 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8362 -> 8441 -> 8529 -> 8472 ( +79, +167, +110) GFLOPS
problemSize = 988 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8502 -> 8503 -> 8571 -> 8591 ( +1, +69, +89) GFLOPS
problemSize = 989 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8500 -> 8521 -> 8583 -> 8596 ( +21, +83, +96) GFLOPS
problemSize = 990 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8567 -> 8563 -> 8655 -> 8625 ( -4, +88, +58) GFLOPS
problemSize = 991 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8542 -> 8597 -> 8661 -> 8653 ( +55, +119, +111) GFLOPS
problemSize = 992 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8692 -> 8712 -> 8740 -> 8783 ( +20, +48, +91) GFLOPS
problemSize = 993 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8583 -> 8594 -> 8650 -> 8670 ( +11, +67, +87) GFLOPS
problemSize = 994 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8684 -> 8686 -> 8752 -> 8751 ( +2, +68, +67) GFLOPS
problemSize = 995 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8752 -> 8816 -> 8824 -> 8813 ( +64, +72, +61) GFLOPS
problemSize = 996 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8849 -> 8875 -> 8908 -> 8922 ( +26, +59, +73) GFLOPS
problemSize = 997 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8914 -> 8927 -> 8940 -> 8974 ( +13, +26, +60) GFLOPS
problemSize = 998 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8946 -> 8986 -> 8982 -> 8995 ( +40, +36, +49) GFLOPS
problemSize = 999 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 9015 -> 9001 -> 9053 -> 9026 ( -14, +38, +11) GFLOPS
problemSize = 1000 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 9055 -> 9084 -> 9128 -> 9095 ( +29, +73, +40) GFLOPS
problemSize = 1001 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8501 -> 8492 -> 8610 -> 8630 ( -9, +109, +129) GFLOPS
problemSize = 1002 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8523 -> 8579 -> 8700 -> 8699 ( +56, +177, +176) GFLOPS
problemSize = 1003 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8632 -> 8615 -> 8767 -> 8715 ( -17, +135, +83) GFLOPS
problemSize = 1004 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8724 -> 8683 -> 8824 -> 8835 ( -41, +100, +111) GFLOPS
problemSize = 1005 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8690 -> 8723 -> 8883 -> 8812 ( +33, +193, +122) GFLOPS
problemSize = 1006 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8753 -> 8762 -> 8934 -> 8879 ( +9, +181, +126) GFLOPS
problemSize = 1007 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8818 -> 8837 -> 8917 -> 8880 ( +19, +99, +62) GFLOPS
problemSize = 1008 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8861 -> 8782 -> 9033 -> 9066 ( -79, +172, +205) GFLOPS
problemSize = 1009 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 7909 -> 7910 -> 7997 -> 7974 ( +1, +88, +65) GFLOPS
problemSize = 1010 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 7950 -> 7965 -> 7997 -> 8025 ( +15, +47, +75) GFLOPS
problemSize = 1011 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8003 -> 8003 -> 8045 -> 8059 ( 0, +42, +56) GFLOPS
problemSize = 1012 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8060 -> 8068 -> 8078 -> 8079 ( +8, +18, +19) GFLOPS
problemSize = 1013 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8070 -> 8130 -> 8101 -> 8109 ( +60, +31, +39) GFLOPS
problemSize = 1014 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8107 -> 8111 -> 8140 -> 8127 ( +4, +33, +20) GFLOPS
problemSize = 1015 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8155 -> 8154 -> 8176 -> 8159 ( -1, +21, +4) GFLOPS
problemSize = 1016 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8146 -> 8187 -> 8266 -> 8199 ( +41, +120, +53) GFLOPS
problemSize = 1017 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8012 -> 8018 -> 8089 -> 8096 ( +6, +77, +84) GFLOPS
problemSize = 1018 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8089 -> 8069 -> 8141 -> 8145 ( -20, +52, +56) GFLOPS
problemSize = 1019 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8150 -> 8117 -> 8172 -> 8168 ( -33, +22, +18) GFLOPS
problemSize = 1020 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8174 -> 8175 -> 8220 -> 8209 ( +1, +46, +35) GFLOPS
problemSize = 1021 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8194 -> 8216 -> 8246 -> 8242 ( +22, +52, +48) GFLOPS
problemSize = 1022 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8230 -> 8225 -> 8266 -> 8296 ( -5, +36, +66) GFLOPS
problemSize = 1023 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8268 -> 8261 -> 8337 -> 8324 ( -7, +69, +56) GFLOPS
problemSize = 1024 | A B^T | 1024 -> 1024 -> 1024 -> 1024 | 8322 -> 8305 -> 8366 -> 8346 ( -17, +44, +24) GFLOPS
problemSize = 976 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8659 -> 8615 -> 8620 -> 8611 ( -44, -39, -48) GFLOPS
problemSize = 977 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8533 -> 8550 -> 8532 -> 8548 ( +17, -1, +15) GFLOPS
problemSize = 978 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8560 -> 8600 -> 8555 -> 8587 ( +40, -5, +27) GFLOPS
problemSize = 979 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8592 -> 8601 -> 8593 -> 8593 ( +9, +1, +1) GFLOPS
problemSize = 980 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8624 -> 8635 -> 8617 -> 8613 ( +11, -7, -11) GFLOPS
problemSize = 981 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8630 -> 8627 -> 8632 -> 8635 ( -3, +2, +5) GFLOPS
problemSize = 982 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8679 -> 8673 -> 8673 -> 8681 ( -6, -6, +2) GFLOPS
problemSize = 983 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8705 -> 8711 -> 8684 -> 8695 ( +6, -21, -10) GFLOPS
problemSize = 984 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8735 -> 8733 -> 8798 -> 8792 ( -2, +63, +57) GFLOPS
problemSize = 985 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8575 -> 8599 -> 8523 -> 8546 ( +24, -52, -29) GFLOPS
problemSize = 986 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8599 -> 8665 -> 8571 -> 8571 ( +66, -28, -28) GFLOPS
problemSize = 987 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8670 -> 8668 -> 8549 -> 8585 ( -2, -121, -85) GFLOPS
problemSize = 988 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8681 -> 8712 -> 8619 -> 8602 ( +31, -62, -79) GFLOPS
problemSize = 989 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8696 -> 8676 -> 8639 -> 8623 ( -20, -57, -73) GFLOPS
problemSize = 990 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8714 -> 8728 -> 8644 -> 8640 ( +14, -70, -74) GFLOPS
problemSize = 991 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8717 -> 8729 -> 8708 -> 8667 ( +12, -9, -50) GFLOPS
problemSize = 992 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8808 -> 8837 -> 8810 -> 8790 ( +29, +2, -18) GFLOPS
problemSize = 993 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8830 -> 8816 -> 8852 -> 8847 ( -14, +22, +17) GFLOPS
problemSize = 994 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8876 -> 8854 -> 8881 -> 8891 ( -22, +5, +15) GFLOPS
problemSize = 995 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8863 -> 8882 -> 8917 -> 8903 ( +19, +54, +40) GFLOPS
problemSize = 996 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8913 -> 8908 -> 8938 -> 8930 ( -5, +25, +17) GFLOPS
problemSize = 997 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8918 -> 8936 -> 8957 -> 8956 ( +18, +39, +38) GFLOPS
problemSize = 998 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8970 -> 8990 -> 9003 -> 8977 ( +20, +33, +7) GFLOPS
problemSize = 999 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8964 -> 8998 -> 9000 -> 9027 ( +34, +36, +63) GFLOPS
problemSize = 1000 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 9084 -> 9043 -> 9023 -> 9024 ( -41, -61, -60) GFLOPS
problemSize = 1001 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8821 -> 8802 -> 8845 -> 8797 ( -19, +24, -24) GFLOPS
problemSize = 1002 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8852 -> 8865 -> 8807 -> 8783 ( +13, -45, -69) GFLOPS
problemSize = 1003 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8843 -> 8924 -> 8807 -> 8819 ( +81, -36, -24) GFLOPS
problemSize = 1004 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8905 -> 8885 -> 8879 -> 8843 ( -20, -26, -62) GFLOPS
problemSize = 1005 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8901 -> 8951 -> 8889 -> 8888 ( +50, -12, -13) GFLOPS
problemSize = 1006 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8925 -> 8885 -> 8938 -> 8862 ( -40, +13, -63) GFLOPS
problemSize = 1007 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8971 -> 8972 -> 8882 -> 8887 ( +1, -89, -84) GFLOPS
problemSize = 1008 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 9017 -> 9058 -> 9007 -> 9036 ( +41, -10, +19) GFLOPS
problemSize = 1009 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8010 -> 8059 -> 8031 -> 8048 ( +49, +21, +38) GFLOPS
problemSize = 1010 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8071 -> 8073 -> 8069 -> 8075 ( +2, -2, +4) GFLOPS
problemSize = 1011 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8044 -> 8064 -> 8084 -> 8063 ( +20, +40, +19) GFLOPS
problemSize = 1012 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8098 -> 8048 -> 8106 -> 8058 ( -50, +8, -40) GFLOPS
problemSize = 1013 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8130 -> 8134 -> 8145 -> 8142 ( +4, +15, +12) GFLOPS
problemSize = 1014 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8151 -> 8183 -> 8169 -> 8133 ( +32, +18, -18) GFLOPS
problemSize = 1015 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8159 -> 8207 -> 8161 -> 8147 ( +48, +2, -12) GFLOPS
problemSize = 1016 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8200 -> 8221 -> 8228 -> 8183 ( +21, +28, -17) GFLOPS
problemSize = 1017 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8193 -> 8173 -> 8192 -> 8193 ( -20, -1, 0) GFLOPS
problemSize = 1018 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8205 -> 8217 -> 8226 -> 8221 ( +12, +21, +16) GFLOPS
problemSize = 1019 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8226 -> 8212 -> 8288 -> 8231 ( -14, +62, +5) GFLOPS
problemSize = 1020 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8263 -> 8277 -> 8260 -> 8278 ( +14, -3, +15) GFLOPS
problemSize = 1021 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8292 -> 8286 -> 8348 -> 8287 ( -6, +56, -5) GFLOPS
problemSize = 1022 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8316 -> 8304 -> 8318 -> 8354 ( -12, +2, +38) GFLOPS
problemSize = 1023 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8320 -> 8355 -> 8344 -> 8351 ( +35, +24, +31) GFLOPS
problemSize = 1024 | A^T B | 1024 -> 1024 -> 1024 -> 1024 | 8370 -> 8386 -> 8382 -> 8368 ( +16, +12, -2) GFLOPS
problemSize = 976 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8597 -> 8608 -> 8621 -> 8621 ( +11, +24, +24) GFLOPS
problemSize = 977 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8528 -> 8487 -> 8513 -> 8546 ( -41, -15, +18) GFLOPS
problemSize = 978 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8497 -> 8563 -> 8558 -> 8602 ( +66, +61, +105) GFLOPS
problemSize = 979 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8574 -> 8582 -> 8625 -> 8607 ( +8, +51, +33) GFLOPS
problemSize = 980 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8576 -> 8638 -> 8644 -> 8645 ( +62, +68, +69) GFLOPS
problemSize = 981 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8704 -> 8587 -> 8667 -> 8666 (-117, -37, -38) GFLOPS
problemSize = 982 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8686 -> 8733 -> 8687 -> 8699 ( +47, +1, +13) GFLOPS
problemSize = 983 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8705 -> 8701 -> 8715 -> 8712 ( -4, +10, +7) GFLOPS
problemSize = 984 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8768 -> 8765 -> 8808 -> 8761 ( -3, +40, -7) GFLOPS
problemSize = 985 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8554 -> 8599 -> 8561 -> 8574 ( +45, +7, +20) GFLOPS
problemSize = 986 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8611 -> 8612 -> 8613 -> 8620 ( +1, +2, +9) GFLOPS
problemSize = 987 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8640 -> 8666 -> 8656 -> 8672 ( +26, +16, +32) GFLOPS
problemSize = 988 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8691 -> 8717 -> 8725 -> 8693 ( +26, +34, +2) GFLOPS
problemSize = 989 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8720 -> 8746 -> 8716 -> 8733 ( +26, -4, +13) GFLOPS
problemSize = 990 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8756 -> 8799 -> 8748 -> 8728 ( +43, -8, -28) GFLOPS
problemSize = 991 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8786 -> 8801 -> 8788 -> 8796 ( +15, +2, +10) GFLOPS
problemSize = 992 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8851 -> 8865 -> 8863 -> 8849 ( +14, +12, -2) GFLOPS
problemSize = 993 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8661 -> 8664 -> 8658 -> 8657 ( +3, -3, -4) GFLOPS
problemSize = 994 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8688 -> 8714 -> 8710 -> 8695 ( +26, +22, +7) GFLOPS
problemSize = 995 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8730 -> 8727 -> 8775 -> 8745 ( -3, +45, +15) GFLOPS
problemSize = 996 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8774 -> 8823 -> 8775 -> 8784 ( +49, +1, +10) GFLOPS
problemSize = 997 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8801 -> 8828 -> 8806 -> 8814 ( +27, +5, +13) GFLOPS
problemSize = 998 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8846 -> 8875 -> 8858 -> 8836 ( +29, +12, -10) GFLOPS
problemSize = 999 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8869 -> 8903 -> 8876 -> 8873 ( +34, +7, +4) GFLOPS
problemSize = 1000 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8941 -> 8944 -> 8945 -> 8922 ( +3, +4, -19) GFLOPS
problemSize = 1001 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8927 -> 8899 -> 8919 -> 8903 ( -28, -8, -24) GFLOPS
problemSize = 1002 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8901 -> 8917 -> 8945 -> 8962 ( +16, +44, +61) GFLOPS
problemSize = 1003 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8951 -> 8967 -> 8987 -> 8981 ( +16, +36, +30) GFLOPS
problemSize = 1004 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 9015 -> 9030 -> 9015 -> 9016 ( +15, 0, +1) GFLOPS
problemSize = 1005 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 9023 -> 9068 -> 9040 -> 9045 ( +45, +17, +22) GFLOPS
problemSize = 1006 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 9066 -> 9087 -> 9103 -> 9076 ( +21, +37, +10) GFLOPS
problemSize = 1007 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 9090 -> 9101 -> 9092 -> 9110 ( +11, +2, +20) GFLOPS
problemSize = 1008 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 9155 -> 9163 -> 9112 -> 9116 ( +8, -43, -39) GFLOPS
problemSize = 1009 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 7946 -> 8006 -> 7995 -> 7978 ( +60, +49, +32) GFLOPS
problemSize = 1010 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 7987 -> 7961 -> 8044 -> 8044 ( -26, +57, +57) GFLOPS
problemSize = 1011 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8052 -> 8036 -> 8058 -> 8070 ( -16, +6, +18) GFLOPS
problemSize = 1012 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8020 -> 8098 -> 8126 -> 8082 ( +78, +106, +62) GFLOPS
problemSize = 1013 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8082 -> 8076 -> 8115 -> 8166 ( -6, +33, +84) GFLOPS
problemSize = 1014 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8094 -> 8180 -> 8151 -> 8183 ( +86, +57, +89) GFLOPS
problemSize = 1015 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8092 -> 8132 -> 8156 -> 8208 ( +40, +64, +116) GFLOPS
problemSize = 1016 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8081 -> 8130 -> 8241 -> 8137 ( +49, +160, +56) GFLOPS
problemSize = 1017 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 7891 -> 7924 -> 7865 -> 7817 ( +33, -26, -74) GFLOPS
problemSize = 1018 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 7985 -> 8008 -> 7929 -> 7918 ( +23, -56, -67) GFLOPS
problemSize = 1019 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8008 -> 8016 -> 7968 -> 7921 ( +8, -40, -87) GFLOPS
problemSize = 1020 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8039 -> 8051 -> 8113 -> 7985 ( +12, +74, -54) GFLOPS
problemSize = 1021 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8064 -> 8115 -> 8000 -> 8047 ( +51, -64, -17) GFLOPS
problemSize = 1022 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 7976 -> 8131 -> 8036 -> 8150 (+155, +60, +174) GFLOPS
problemSize = 1023 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8069 -> 8233 -> 8217 -> 8075 (+164, +148, +6) GFLOPS
problemSize = 1024 | A^T B^T | 1024 -> 1024 -> 1024 -> 1024 | 8163 -> 8077 -> 8077 -> 8121 ( -86, -86, -42) GFLOPS
BF16 (48x48x24)
problemSize = 976 | A B | 896 -> 896 -> 896 -> 896 | 7582 -> 7495 -> 7594 -> 7578 ( -87, +12, -4) GFLOPS
problemSize = 977 | A B | 832 -> 832 -> 768 -> 768 | 7068 -> 7075 -> 7025 -> 7028 ( +7, -43, -40) GFLOPS
problemSize = 978 | A B | 768 -> 832 -> 768 -> 768 | 7110 -> 7133 -> 7098 -> 7060 ( +23, -12, -50) GFLOPS
problemSize = 979 | A B | 832 -> 832 -> 768 -> 768 | 7171 -> 7124 -> 7109 -> 7099 ( -47, -62, -72) GFLOPS
problemSize = 980 | A B | 768 -> 832 -> 768 -> 768 | 7156 -> 7200 -> 7120 -> 7161 ( +44, -36, +5) GFLOPS
problemSize = 981 | A B | 832 -> 832 -> 768 -> 768 | 7194 -> 7180 -> 7163 -> 7127 ( -14, -31, -67) GFLOPS
problemSize = 982 | A B | 768 -> 832 -> 768 -> 768 | 7215 -> 7253 -> 7184 -> 7160 ( +38, -31, -55) GFLOPS
problemSize = 983 | A B | 832 -> 832 -> 768 -> 768 | 7234 -> 7238 -> 7200 -> 7195 ( +4, -34, -39) GFLOPS
problemSize = 984 | A B | 768 -> 768 -> 832 -> 768 | 7289 -> 7267 -> 7298 -> 7296 ( -22, +9, +7) GFLOPS
problemSize = 985 | A B | 896 -> 896 -> 896 -> 896 | 7498 -> 7490 -> 7536 -> 7542 ( -8, +38, +44) GFLOPS
problemSize = 986 | A B | 896 -> 896 -> 896 -> 896 | 7533 -> 7556 -> 7610 -> 7604 ( +23, +77, +71) GFLOPS
problemSize = 987 | A B | 896 -> 896 -> 896 -> 896 | 7575 -> 7585 -> 7617 -> 7594 ( +10, +42, +19) GFLOPS
problemSize = 988 | A B | 896 -> 896 -> 896 -> 896 | 7602 -> 7633 -> 7670 -> 7654 ( +31, +68, +52) GFLOPS
problemSize = 989 | A B | 896 -> 896 -> 896 -> 896 | 7650 -> 7637 -> 7701 -> 7692 ( -13, +51, +42) GFLOPS
problemSize = 990 | A B | 896 -> 896 -> 896 -> 896 | 7662 -> 7657 -> 7739 -> 7709 ( -5, +77, +47) GFLOPS
problemSize = 991 | A B | 896 -> 896 -> 896 -> 896 | 7671 -> 7701 -> 7723 -> 7716 ( +30, +52, +45) GFLOPS
problemSize = 992 | A B | 896 -> 896 -> 896 -> 896 | 7753 -> 7769 -> 7831 -> 7783 ( +16, +78, +30) GFLOPS
problemSize = 993 | A B | 896 -> 896 -> 896 -> 896 | 7617 -> 7619 -> 7721 -> 7708 ( +2, +104, +91) GFLOPS
problemSize = 994 | A B | 896 -> 896 -> 896 -> 896 | 7725 -> 7680 -> 7748 -> 7742 ( -45, +23, +17) GFLOPS
problemSize = 995 | A B | 896 -> 896 -> 896 -> 896 | 7680 -> 7686 -> 7804 -> 7761 ( +6, +124, +81) GFLOPS
problemSize = 996 | A B | 896 -> 896 -> 896 -> 896 | 7782 -> 7723 -> 7809 -> 7801 ( -59, +27, +19) GFLOPS
problemSize = 997 | A B | 896 -> 896 -> 896 -> 896 | 7742 -> 7739 -> 7849 -> 7837 ( -3, +107, +95) GFLOPS
problemSize = 998 | A B | 896 -> 896 -> 896 -> 896 | 7875 -> 7807 -> 7857 -> 7890 ( -68, -18, +15) GFLOPS
problemSize = 999 | A B | 896 -> 896 -> 896 -> 896 | 7834 -> 7811 -> 7924 -> 7892 ( -23, +90, +58) GFLOPS
problemSize = 1000 | A B | 896 -> 896 -> 896 -> 896 | 7903 -> 7842 -> 7955 -> 7952 ( -61, +52, +49) GFLOPS
problemSize = 1001 | A B | 832 -> 832 -> 768 -> 768 | 7391 -> 7365 -> 7324 -> 7318 ( -26, -67, -73) GFLOPS
problemSize = 1002 | A B | 768 -> 832 -> 768 -> 768 | 7407 -> 7414 -> 7327 -> 7394 ( +7, -80, -13) GFLOPS
problemSize = 1003 | A B | 832 -> 832 -> 768 -> 768 | 7444 -> 7449 -> 7359 -> 7385 ( +5, -85, -59) GFLOPS
problemSize = 1004 | A B | 768 -> 832 -> 768 -> 768 | 7475 -> 7490 -> 7442 -> 7394 ( +15, -33, -81) GFLOPS
problemSize = 1005 | A B | 832 -> 832 -> 768 -> 768 | 7510 -> 7488 -> 7469 -> 7440 ( -22, -41, -70) GFLOPS
problemSize = 1006 | A B | 768 -> 832 -> 768 -> 768 | 7541 -> 7514 -> 7470 -> 7485 ( -27, -71, -56) GFLOPS
problemSize = 1007 | A B | 832 -> 832 -> 768 -> 768 | 7519 -> 7539 -> 7485 -> 7469 ( +20, -34, -50) GFLOPS
problemSize = 1008 | A B | 768 -> 768 -> 832 -> 768 | 7551 -> 7598 -> 7618 -> 7593 ( +47, +67, +42) GFLOPS
problemSize = 1009 | A B | 896 -> 896 -> 896 -> 896 | 6779 -> 6771 -> 6797 -> 6803 ( -8, +18, +24) GFLOPS
problemSize = 1010 | A B | 896 -> 896 -> 896 -> 896 | 6809 -> 6795 -> 6836 -> 6821 ( -14, +27, +12) GFLOPS
problemSize = 1011 | A B | 896 -> 896 -> 896 -> 896 | 6832 -> 6824 -> 6876 -> 6849 ( -8, +44, +17) GFLOPS
problemSize = 1012 | A B | 896 -> 896 -> 896 -> 896 | 6861 -> 6853 -> 6899 -> 6882 ( -8, +38, +21) GFLOPS
problemSize = 1013 | A B | 896 -> 896 -> 896 -> 896 | 6871 -> 6880 -> 6925 -> 6917 ( +9, +54, +46) GFLOPS
problemSize = 1014 | A B | 896 -> 896 -> 896 -> 896 | 6935 -> 6924 -> 6944 -> 6955 ( -11, +9, +20) GFLOPS
problemSize = 1015 | A B | 896 -> 896 -> 896 -> 896 | 6917 -> 6912 -> 6972 -> 6981 ( -5, +55, +64) GFLOPS
problemSize = 1016 | A B | 896 -> 896 -> 896 -> 896 | 6978 -> 6966 -> 7003 -> 6992 ( -12, +25, +14) GFLOPS
problemSize = 1017 | A B | 896 -> 896 -> 896 -> 896 | 6867 -> 6886 -> 6927 -> 6937 ( +19, +60, +70) GFLOPS
problemSize = 1018 | A B | 896 -> 896 -> 896 -> 896 | 6954 -> 6904 -> 6968 -> 6960 ( -50, +14, +6) GFLOPS
problemSize = 1019 | A B | 896 -> 896 -> 896 -> 896 | 6939 -> 6928 -> 6976 -> 6999 ( -11, +37, +60) GFLOPS
problemSize = 1020 | A B | 896 -> 896 -> 896 -> 896 | 7006 -> 6958 -> 7014 -> 7001 ( -48, +8, -5) GFLOPS
problemSize = 1021 | A B | 896 -> 896 -> 896 -> 896 | 6962 -> 6987 -> 7030 -> 7031 ( +25, +68, +69) GFLOPS
problemSize = 1022 | A B | 896 -> 896 -> 896 -> 896 | 7040 -> 6989 -> 7033 -> 7060 ( -51, -7, +20) GFLOPS
problemSize = 1023 | A B | 896 -> 896 -> 896 -> 896 | 7014 -> 7027 -> 7053 -> 7066 ( +13, +39, +52) GFLOPS
problemSize = 1024 | A B | 896 -> 896 -> 896 -> 896 | 7109 -> 7077 -> 7134 -> 7119 ( -32, +25, +10) GFLOPS
problemSize = 976 | A B^T | 1024 -> 896 -> 896 -> 896 | 7678 -> 7696 -> 7620 -> 7664 ( +18, -58, -14) GFLOPS
problemSize = 977 | A B^T | 832 -> 832 -> 768 -> 768 | 7116 -> 7089 -> 6971 -> 6983 ( -27, -145, -133) GFLOPS
problemSize = 978 | A B^T | 832 -> 832 -> 768 -> 768 | 7131 -> 7146 -> 7071 -> 7077 ( +15, -60, -54) GFLOPS
problemSize = 979 | A B^T | 832 -> 832 -> 768 -> 768 | 7225 -> 7238 -> 7130 -> 7140 ( +13, -95, -85) GFLOPS
problemSize = 980 | A B^T | 832 -> 832 -> 768 -> 768 | 7255 -> 7336 -> 7206 -> 7184 ( +81, -49, -71) GFLOPS
problemSize = 981 | A B^T | 832 -> 832 -> 768 -> 768 | 7330 -> 7311 -> 7206 -> 7220 ( -19, -124, -110) GFLOPS
problemSize = 982 | A B^T | 832 -> 832 -> 768 -> 768 | 7345 -> 7379 -> 7254 -> 7279 ( +34, -91, -66) GFLOPS
problemSize = 983 | A B^T | 832 -> 832 -> 768 -> 768 | 7383 -> 7395 -> 7287 -> 7331 ( +12, -96, -52) GFLOPS
problemSize = 984 | A B^T | 768 -> 768 -> 832 -> 768 | 7480 -> 7425 -> 7416 -> 7414 ( -55, -64, -66) GFLOPS
problemSize = 985 | A B^T | 896 -> 896 -> 832 -> 896 | 7580 -> 7560 -> 7048 -> 7576 ( -20, -532, -4) GFLOPS
problemSize = 986 | A B^T | 896 -> 896 -> 832 -> 896 | 7551 -> 7607 -> 7083 -> 7698 ( +56, -468, +147) GFLOPS
problemSize = 987 | A B^T | 896 -> 896 -> 832 -> 896 | 7705 -> 7720 -> 7121 -> 7728 ( +15, -584, +23) GFLOPS
problemSize = 988 | A B^T | 896 -> 896 -> 832 -> 896 | 7698 -> 7805 -> 7233 -> 7789 (+107, -465, +91) GFLOPS
problemSize = 989 | A B^T | 896 -> 896 -> 832 -> 896 | 7829 -> 7838 -> 7185 -> 7849 ( +9, -644, +20) GFLOPS
problemSize = 990 | A B^T | 896 -> 896 -> 832 -> 896 | 7766 -> 7906 -> 7232 -> 7907 (+140, -534, +141) GFLOPS
problemSize = 991 | A B^T | 896 -> 896 -> 832 -> 896 | 7931 -> 7908 -> 7303 -> 7892 ( -23, -628, -39) GFLOPS
problemSize = 992 | A B^T | 896 -> 1024 -> 832 -> 896 | 7879 -> 8036 -> 7428 -> 7972 (+157, -451, +93) GFLOPS
problemSize = 993 | A B^T | 896 -> 896 -> 896 -> 896 | 7685 -> 7660 -> 7568 -> 7703 ( -25, -117, +18) GFLOPS
problemSize = 994 | A B^T | 896 -> 896 -> 896 -> 896 | 7716 -> 7751 -> 7654 -> 7798 ( +35, -62, +82) GFLOPS
problemSize = 995 | A B^T | 896 -> 896 -> 896 -> 896 | 7808 -> 7815 -> 7696 -> 7863 ( +7, -112, +55) GFLOPS
problemSize = 996 | A B^T | 896 -> 896 -> 896 -> 896 | 7861 -> 7903 -> 7773 -> 7949 ( +42, -88, +88) GFLOPS
problemSize = 997 | A B^T | 896 -> 896 -> 896 -> 896 | 7956 -> 7947 -> 7802 -> 7989 ( -9, -154, +33) GFLOPS
problemSize = 998 | A B^T | 896 -> 896 -> 896 -> 896 | 7941 -> 8001 -> 7857 -> 8006 ( +60, -84, +65) GFLOPS
problemSize = 999 | A B^T | 896 -> 896 -> 896 -> 896 | 8036 -> 8023 -> 7884 -> 8040 ( -13, -152, +4) GFLOPS
problemSize = 1000 | A B^T | 1024 -> 896 -> 896 -> 896 | 8021 -> 8092 -> 7995 -> 8056 ( +71, -26, +35) GFLOPS
problemSize = 1001 | A B^T | 832 -> 832 -> 768 -> 768 | 7324 -> 7331 -> 7160 -> 7155 ( +7, -164, -169) GFLOPS
problemSize = 1002 | A B^T | 832 -> 832 -> 768 -> 768 | 7346 -> 7400 -> 7259 -> 7238 ( +54, -87, -108) GFLOPS
problemSize = 1003 | A B^T | 832 -> 832 -> 768 -> 768 | 7440 -> 7465 -> 7261 -> 7276 ( +25, -179, -164) GFLOPS
problemSize = 1004 | A B^T | 832 -> 832 -> 768 -> 768 | 7462 -> 7528 -> 7376 -> 7346 ( +66, -86, -116) GFLOPS
problemSize = 1005 | A B^T | 832 -> 832 -> 768 -> 768 | 7560 -> 7555 -> 7372 -> 7399 ( -5, -188, -161) GFLOPS
problemSize = 1006 | A B^T | 832 -> 832 -> 768 -> 768 | 7514 -> 7621 -> 7442 -> 7454 (+107, -72, -60) GFLOPS
problemSize = 1007 | A B^T | 832 -> 832 -> 768 -> 768 | 7638 -> 7641 -> 7438 -> 7412 ( +3, -200, -226) GFLOPS
problemSize = 1008 | A B^T | 768 -> 768 -> 832 -> 768 | 7657 -> 7636 -> 7678 -> 7638 ( -21, +21, -19) GFLOPS
problemSize = 1009 | A B^T | 896 -> 896 -> 832 -> 896 | 6836 -> 6841 -> 6819 -> 6843 ( +5, -17, +7) GFLOPS
problemSize = 1010 | A B^T | 896 -> 896 -> 832 -> 896 | 6831 -> 6914 -> 6889 -> 6910 ( +83, +58, +79) GFLOPS
problemSize = 1011 | A B^T | 896 -> 896 -> 832 -> 896 | 6960 -> 6964 -> 6915 -> 6967 ( +4, -45, +7) GFLOPS
problemSize = 1012 | A B^T | 896 -> 896 -> 832 -> 896 | 6952 -> 7027 -> 6971 -> 7009 ( +75, +19, +57) GFLOPS
problemSize = 1013 | A B^T | 896 -> 896 -> 832 -> 896 | 7057 -> 7051 -> 7006 -> 7034 ( -6, -51, -23) GFLOPS
problemSize = 1014 | A B^T | 896 -> 896 -> 832 -> 896 | 6999 -> 7113 -> 7028 -> 7060 (+114, +29, +61) GFLOPS
problemSize = 1015 | A B^T | 896 -> 896 -> 832 -> 896 | 7129 -> 7108 -> 7036 -> 7078 ( -21, -93, -51) GFLOPS
problemSize = 1016 | A B^T | 896 -> 1024 -> 832 -> 896 | 7090 -> 7670 -> 7090 -> 7161 (+580, 0, +71) GFLOPS
problemSize = 1017 | A B^T | 896 -> 896 -> 896 -> 896 | 6929 -> 6933 -> 6882 -> 6970 ( +4, -47, +41) GFLOPS
problemSize = 1018 | A B^T | 896 -> 896 -> 896 -> 896 | 7029 -> 6992 -> 6932 -> 7017 ( -37, -97, -12) GFLOPS
problemSize = 1019 | A B^T | 896 -> 896 -> 896 -> 896 | 7058 -> 7041 -> 6968 -> 7078 ( -17, -90, +20) GFLOPS
problemSize = 1020 | A B^T | 896 -> 896 -> 896 -> 896 | 7136 -> 7119 -> 7055 -> 7104 ( -17, -81, -32) GFLOPS
problemSize = 1021 | A B^T | 896 -> 896 -> 896 -> 896 | 7134 -> 7139 -> 7038 -> 7131 ( +5, -96, -3) GFLOPS
problemSize = 1022 | A B^T | 896 -> 896 -> 896 -> 896 | 7178 -> 7177 -> 7062 -> 7195 ( -1, -116, +17) GFLOPS
problemSize = 1023 | A B^T | 896 -> 896 -> 896 -> 896 | 7202 -> 7215 -> 7116 -> 7188 ( +13, -86, -14) GFLOPS
problemSize = 1024 | A B^T | 1024 -> 896 -> 896 -> 896 | 7548 -> 7207 -> 7193 -> 7217 (-341, -355, -331) GFLOPS
BF16 (48x48x32)
problemSize = 976 | A B | 896 -> 896 -> 896 -> 896 | 7822 -> 7852 -> 7880 -> 7821 ( +30, +58, -1) GFLOPS
problemSize = 977 | A B | 704 -> 704 -> 768 -> 768 | 7230 -> 7245 -> 7225 -> 7337 ( +15, -5, +107) GFLOPS
problemSize = 978 | A B | 768 -> 704 -> 768 -> 768 | 7267 -> 7270 -> 7257 -> 7335 ( +3, -10, +68) GFLOPS
problemSize = 979 | A B | 704 -> 704 -> 768 -> 768 | 7287 -> 7277 -> 7264 -> 7374 ( -10, -23, +87) GFLOPS
problemSize = 980 | A B | 768 -> 704 -> 768 -> 768 | 7314 -> 7326 -> 7316 -> 7410 ( +12, +2, +96) GFLOPS
problemSize = 981 | A B | 704 -> 704 -> 768 -> 768 | 7350 -> 7360 -> 7322 -> 7478 ( +10, -28, +128) GFLOPS
problemSize = 982 | A B | 768 -> 704 -> 768 -> 768 | 7389 -> 7389 -> 7376 -> 7489 ( 0, -13, +100) GFLOPS
problemSize = 983 | A B | 704 -> 704 -> 768 -> 768 | 7384 -> 7387 -> 7444 -> 7482 ( +3, +60, +98) GFLOPS
problemSize = 984 | A B | 768 -> 704 -> 768 -> 704 | 7428 -> 7427 -> 7449 -> 7444 ( -1, +21, +16) GFLOPS
problemSize = 985 | A B | 640 -> 640 -> 640 -> 640 | 7219 -> 7244 -> 7291 -> 7224 ( +25, +72, +5) GFLOPS
problemSize = 986 | A B | 640 -> 640 -> 640 -> 640 | 7152 -> 7282 -> 7333 -> 7269 (+130, +181, +117) GFLOPS
problemSize = 987 | A B | 640 -> 640 -> 640 -> 640 | 7302 -> 7306 -> 7393 -> 7295 ( +4, +91, -7) GFLOPS
problemSize = 988 | A B | 640 -> 640 -> 640 -> 640 | 7256 -> 7367 -> 7378 -> 7351 (+111, +122, +95) GFLOPS
problemSize = 989 | A B | 640 -> 640 -> 640 -> 640 | 7371 -> 7382 -> 7402 -> 7346 ( +11, +31, -25) GFLOPS
problemSize = 990 | A B | 640 -> 640 -> 640 -> 640 | 7282 -> 7446 -> 7443 -> 7391 (+164, +161, +109) GFLOPS
problemSize = 991 | A B | 640 -> 640 -> 640 -> 640 | 7423 -> 7439 -> 7478 -> 7474 ( +16, +55, +51) GFLOPS
problemSize = 992 | A B | 640 -> 640 -> 640 -> 640 | 7491 -> 7598 -> 7616 -> 7580 (+107, +125, +89) GFLOPS
problemSize = 993 | A B | 768 -> 768 -> 768 -> 768 | 7234 -> 7219 -> 7349 -> 7251 ( -15, +115, +17) GFLOPS
problemSize = 994 | A B | 832 -> 768 -> 768 -> 768 | 7310 -> 7267 -> 7429 -> 7273 ( -43, +119, -37) GFLOPS
problemSize = 995 | A B | 768 -> 768 -> 768 -> 768 | 7274 -> 7309 -> 7389 -> 7267 ( +35, +115, -7) GFLOPS
problemSize = 996 | A B | 832 -> 768 -> 768 -> 768 | 7391 -> 7355 -> 7412 -> 7309 ( -36, +21, -82) GFLOPS
problemSize = 997 | A B | 768 -> 768 -> 768 -> 768 | 7331 -> 7355 -> 7391 -> 7349 ( +24, +60, +18) GFLOPS
problemSize = 998 | A B | 832 -> 768 -> 768 -> 768 | 7414 -> 7365 -> 7415 -> 7344 ( -49, +1, -70) GFLOPS
problemSize = 999 | A B | 768 -> 768 -> 768 -> 768 | 7381 -> 7408 -> 7496 -> 7378 ( +27, +115, -3) GFLOPS
problemSize = 1000 | A B | 832 -> 768 -> 768 -> 768 | 7507 -> 7444 -> 7541 -> 7416 ( -63, +34, -91) GFLOPS
problemSize = 1001 | A B | 896 -> 896 -> 896 -> 896 | 8107 -> 8138 -> 8109 -> 8104 ( +31, +2, -3) GFLOPS
problemSize = 1002 | A B | 896 -> 896 -> 896 -> 896 | 8116 -> 8147 -> 8146 -> 8139 ( +31, +30, +23) GFLOPS
problemSize = 1003 | A B | 896 -> 896 -> 896 -> 896 | 8185 -> 8169 -> 8173 -> 8163 ( -16, -12, -22) GFLOPS
problemSize = 1004 | A B | 896 -> 896 -> 896 -> 896 | 8183 -> 8209 -> 8217 -> 8198 ( +26, +34, +15) GFLOPS
problemSize = 1005 | A B | 896 -> 896 -> 896 -> 896 | 8239 -> 8233 -> 8229 -> 8250 ( -6, -10, +11) GFLOPS
problemSize = 1006 | A B | 896 -> 896 -> 896 -> 896 | 8238 -> 8262 -> 8272 -> 8249 ( +24, +34, +11) GFLOPS
problemSize = 1007 | A B | 896 -> 896 -> 896 -> 896 | 8286 -> 8284 -> 8298 -> 8285 ( -2, +12, -1) GFLOPS
problemSize = 1008 | A B | 896 -> 896 -> 896 -> 896 | 8336 -> 8352 -> 8350 -> 8360 ( +16, +14, +24) GFLOPS
problemSize = 1009 | A B | 704 -> 704 -> 768 -> 768 | 6800 -> 6808 -> 6966 -> 7064 ( +8, +166, +264) GFLOPS
problemSize = 1010 | A B | 768 -> 704 -> 768 -> 768 | 6988 -> 6843 -> 6973 -> 7090 (-145, -15, +102) GFLOPS
problemSize = 1011 | A B | 704 -> 704 -> 768 -> 768 | 6844 -> 6864 -> 6992 -> 7104 ( +20, +148, +260) GFLOPS
problemSize = 1012 | A B | 768 -> 704 -> 768 -> 768 | 7046 -> 6896 -> 7020 -> 7135 (-150, -26, +89) GFLOPS
problemSize = 1013 | A B | 704 -> 704 -> 768 -> 768 | 6894 -> 6902 -> 7031 -> 7159 ( +8, +137, +265) GFLOPS
problemSize = 1014 | A B | 768 -> 704 -> 768 -> 768 | 7084 -> 6915 -> 7066 -> 7166 (-169, -18, +82) GFLOPS
problemSize = 1015 | A B | 704 -> 704 -> 768 -> 768 | 6948 -> 6935 -> 7087 -> 7228 ( -13, +139, +280) GFLOPS
problemSize = 1016 | A B | 768 -> 704 -> 768 -> 768 | 7147 -> 6980 -> 7153 -> 7246 (-167, +6, +99) GFLOPS
problemSize = 1017 | A B | 640 -> 640 -> 640 -> 640 | 6971 -> 6995 -> 6991 -> 6941 ( +24, +20, -30) GFLOPS
problemSize = 1018 | A B | 640 -> 640 -> 640 -> 640 | 6858 -> 7034 -> 7042 -> 6981 (+176, +184, +123) GFLOPS
problemSize = 1019 | A B | 640 -> 640 -> 640 -> 640 | 7050 -> 7032 -> 7046 -> 6988 ( -18, -4, -62) GFLOPS
problemSize = 1020 | A B | 640 -> 640 -> 640 -> 640 | 6913 -> 7068 -> 7075 -> 7021 (+155, +162, +108) GFLOPS
problemSize = 1021 | A B | 640 -> 640 -> 640 -> 640 | 7075 -> 7070 -> 7097 -> 7068 ( -5, +22, -7) GFLOPS
problemSize = 1022 | A B | 640 -> 640 -> 640 -> 640 | 6963 -> 7099 -> 7117 -> 7071 (+136, +154, +108) GFLOPS
problemSize = 1023 | A B | 640 -> 640 -> 640 -> 640 | 7113 -> 7121 -> 7150 -> 7087 ( +8, +37, -26) GFLOPS
problemSize = 1024 | A B | 640 -> 640 -> 640 -> 640 | 7131 -> 7204 -> 7240 -> 7190 ( +73, +109, +59) GFLOPS
problemSize = 976 | A B^T | 896 -> 1024 -> 896 -> 896 | 7879 -> 8039 -> 7863 -> 7962 (+160, -16, +83) GFLOPS
problemSize = 977 | A B^T | 832 -> 832 -> 832 -> 832 | 7292 -> 7289 -> 7331 -> 7334 ( -3, +39, +42) GFLOPS
problemSize = 978 | A B^T | 832 -> 832 -> 832 -> 832 | 7315 -> 7405 -> 7416 -> 7398 ( +90, +101, +83) GFLOPS
problemSize = 979 | A B^T | 832 -> 832 -> 832 -> 832 | 7442 -> 7450 -> 7458 -> 7433 ( +8, +16, -9) GFLOPS
problemSize = 980 | A B^T | 832 -> 832 -> 832 -> 832 | 7445 -> 7526 -> 7532 -> 7510 ( +81, +87, +65) GFLOPS
problemSize = 981 | A B^T | 832 -> 832 -> 832 -> 832 | 7522 -> 7565 -> 7536 -> 7531 ( +43, +14, +9) GFLOPS
problemSize = 982 | A B^T | 832 -> 832 -> 832 -> 832 | 7524 -> 7606 -> 7575 -> 7564 ( +82, +51, +40) GFLOPS
problemSize = 983 | A B^T | 832 -> 832 -> 832 -> 832 | 7601 -> 7604 -> 7635 -> 7566 ( +3, +34, -35) GFLOPS
problemSize = 984 | A B^T | 832 -> 832 -> 768 -> 832 | 7626 -> 7655 -> 7617 -> 7648 ( +29, -9, +22) GFLOPS
problemSize = 985 | A B^T | 704 -> 704 -> 640 -> 640 | 7295 -> 7306 -> 7246 -> 7300 ( +11, -49, +5) GFLOPS
problemSize = 986 | A B^T | 704 -> 704 -> 640 -> 640 | 7373 -> 7439 -> 7342 -> 7341 ( +66, -31, -32) GFLOPS
problemSize = 987 | A B^T | 704 -> 704 -> 640 -> 640 | 7430 -> 7467 -> 7397 -> 7446 ( +37, -33, +16) GFLOPS
problemSize = 988 | A B^T | 704 -> 704 -> 640 -> 640 | 7475 -> 7502 -> 7481 -> 7485 ( +27, +6, +10) GFLOPS
problemSize = 989 | A B^T | 704 -> 704 -> 640 -> 640 | 7556 -> 7522 -> 7502 -> 7551 ( -34, -54, -5) GFLOPS
problemSize = 990 | A B^T | 704 -> 704 -> 640 -> 640 | 7590 -> 7617 -> 7531 -> 7565 ( +27, -59, -25) GFLOPS
problemSize = 991 | A B^T | 704 -> 704 -> 640 -> 640 | 7620 -> 7685 -> 7581 -> 7625 ( +65, -39, +5) GFLOPS
problemSize = 992 | A B^T | 768 -> 768 -> 704 -> 704 | 7759 -> 7763 -> 7846 -> 7876 ( +4, +87, +117) GFLOPS
problemSize = 993 | A B^T | 832 -> 832 -> 832 -> 832 | 7441 -> 7441 -> 7508 -> 7474 ( 0, +67, +33) GFLOPS
problemSize = 994 | A B^T | 832 -> 832 -> 832 -> 832 | 7388 -> 7498 -> 7616 -> 7519 (+110, +228, +131) GFLOPS
problemSize = 995 | A B^T | 832 -> 832 -> 832 -> 832 | 7553 -> 7596 -> 7599 -> 7574 ( +43, +46, +21) GFLOPS
problemSize = 996 | A B^T | 832 -> 832 -> 832 -> 832 | 7482 -> 7638 -> 7671 -> 7729 (+156, +189, +247) GFLOPS
problemSize = 997 | A B^T | 832 -> 832 -> 832 -> 832 | 7663 -> 7686 -> 7661 -> 7677 ( +23, -2, +14) GFLOPS
problemSize = 998 | A B^T | 832 -> 832 -> 832 -> 832 | 7578 -> 7731 -> 7734 -> 7677 (+153, +156, +99) GFLOPS
problemSize = 999 | A B^T | 832 -> 832 -> 832 -> 832 | 7743 -> 7742 -> 7738 -> 7734 ( -1, -5, -9) GFLOPS
problemSize = 1000 | A B^T | 832 -> 832 -> 768 -> 768 | 7744 -> 7694 -> 7730 -> 7698 ( -50, -14, -46) GFLOPS
problemSize = 1001 | A B^T | 896 -> 896 -> 832 -> 896 | 8055 -> 8070 -> 7514 -> 8110 ( +15, -541, +55) GFLOPS
problemSize = 1002 | A B^T | 896 -> 896 -> 832 -> 896 | 8114 -> 8158 -> 7603 -> 8165 ( +44, -511, +51) GFLOPS
problemSize = 1003 | A B^T | 896 -> 896 -> 832 -> 896 | 8198 -> 8216 -> 7639 -> 8228 ( +18, -559, +30) GFLOPS
problemSize = 1004 | A B^T | 896 -> 896 -> 832 -> 896 | 8230 -> 8281 -> 7691 -> 8301 ( +51, -539, +71) GFLOPS
problemSize = 1005 | A B^T | 896 -> 896 -> 832 -> 896 | 8317 -> 8311 -> 7723 -> 8319 ( -6, -594, +2) GFLOPS
problemSize = 1006 | A B^T | 896 -> 896 -> 832 -> 896 | 8282 -> 8355 -> 7817 -> 8355 ( +73, -465, +73) GFLOPS
problemSize = 1007 | A B^T | 896 -> 896 -> 832 -> 896 | 8373 -> 8390 -> 7751 -> 8393 ( +17, -622, +20) GFLOPS
problemSize = 1008 | A B^T | 896 -> 1024 -> 896 -> 1024 | 8413 -> 8561 -> 8436 -> 8507 (+148, +23, +94) GFLOPS
problemSize = 1009 | A B^T | 832 -> 832 -> 832 -> 832 | 7128 -> 7141 -> 7209 -> 7180 ( +13, +81, +52) GFLOPS
problemSize = 1010 | A B^T | 832 -> 832 -> 832 -> 832 | 7147 -> 7203 -> 7258 -> 7179 ( +56, +111, +32) GFLOPS
problemSize = 1011 | A B^T | 832 -> 832 -> 832 -> 832 | 7239 -> 7242 -> 7305 -> 7252 ( +3, +66, +13) GFLOPS
problemSize = 1012 | A B^T | 832 -> 832 -> 832 -> 832 | 7225 -> 7304 -> 7328 -> 7322 ( +79, +103, +97) GFLOPS
problemSize = 1013 | A B^T | 832 -> 832 -> 832 -> 832 | 7350 -> 7311 -> 7365 -> 7306 ( -39, +15, -44) GFLOPS
problemSize = 1014 | A B^T | 832 -> 832 -> 832 -> 832 | 7289 -> 7357 -> 7423 -> 7346 ( +68, +134, +57) GFLOPS
problemSize = 1015 | A B^T | 832 -> 832 -> 832 -> 832 | 7371 -> 7385 -> 7399 -> 7390 ( +14, +28, +19) GFLOPS
problemSize = 1016 | A B^T | 832 -> 832 -> 768 -> 832 | 7338 -> 7454 -> 7367 -> 7402 (+116, +29, +64) GFLOPS
problemSize = 1017 | A B^T | 704 -> 704 -> 640 -> 640 | 7014 -> 7018 -> 7020 -> 7025 ( +4, +6, +11) GFLOPS
problemSize = 1018 | A B^T | 704 -> 704 -> 640 -> 640 | 7064 -> 7065 -> 7080 -> 7092 ( +1, +16, +28) GFLOPS
problemSize = 1019 | A B^T | 704 -> 704 -> 640 -> 640 | 7102 -> 7103 -> 7132 -> 7132 ( +1, +30, +30) GFLOPS
problemSize = 1020 | A B^T | 704 -> 704 -> 640 -> 640 | 7150 -> 7161 -> 7186 -> 7176 ( +11, +36, +26) GFLOPS
problemSize = 1021 | A B^T | 704 -> 704 -> 640 -> 640 | 7216 -> 7197 -> 7203 -> 7211 ( -19, -13, -5) GFLOPS
problemSize = 1022 | A B^T | 704 -> 704 -> 640 -> 640 | 7208 -> 7217 -> 7242 -> 7258 ( +9, +34, +50) GFLOPS
problemSize = 1023 | A B^T | 704 -> 704 -> 640 -> 640 | 7247 -> 7241 -> 7259 -> 7298 ( -6, +12, +51) GFLOPS
problemSize = 1024 | A B^T | 768 -> 768 -> 704 -> 704 | 7681 -> 7635 -> 7393 -> 7391 ( -46, -288, -290) GFLOPS
BF16 (48x48x40)
problemSize = 976 | A B | 768 -> 768 -> 768 -> 768 | 7446 -> 7528 -> 7459 -> 7468 ( +82, +13, +22) GFLOPS
problemSize = 977 | A B | 768 -> 768 -> 704 -> 768 | 7480 -> 7473 -> 7455 -> 7481 ( -7, -25, +1) GFLOPS
problemSize = 978 | A B | 768 -> 768 -> 704 -> 768 | 7481 -> 7492 -> 7497 -> 7479 ( +11, +16, -2) GFLOPS
problemSize = 979 | A B | 768 -> 768 -> 704 -> 768 | 7518 -> 7561 -> 7517 -> 7521 ( +43, -1, +3) GFLOPS
problemSize = 980 | A B | 768 -> 768 -> 704 -> 768 | 7588 -> 7544 -> 7593 -> 7541 ( -44, +5, -47) GFLOPS
problemSize = 981 | A B | 768 -> 768 -> 704 -> 768 | 7591 -> 7579 -> 7588 -> 7564 ( -12, -3, -27) GFLOPS
problemSize = 982 | A B | 768 -> 768 -> 704 -> 768 | 7589 -> 7605 -> 7608 -> 7620 ( +16, +19, +31) GFLOPS
problemSize = 983 | A B | 768 -> 768 -> 704 -> 768 | 7623 -> 7623 -> 7631 -> 7596 ( 0, +8, -27) GFLOPS
problemSize = 984 | A B | 768 -> 768 -> 768 -> 768 | 7667 -> 7704 -> 7671 -> 7697 ( +37, +4, +30) GFLOPS
problemSize = 985 | A B | 640 -> 640 -> 640 -> 640 | 7339 -> 7316 -> 7378 -> 7436 ( -23, +39, +97) GFLOPS
problemSize = 986 | A B | 640 -> 640 -> 640 -> 640 | 7374 -> 7360 -> 7414 -> 7427 ( -14, +40, +53) GFLOPS
problemSize = 987 | A B | 640 -> 640 -> 640 -> 640 | 7340 -> 7385 -> 7426 -> 7473 ( +45, +86, +133) GFLOPS
problemSize = 988 | A B | 640 -> 640 -> 640 -> 640 | 7402 -> 7434 -> 7491 -> 7530 ( +32, +89, +128) GFLOPS
problemSize = 989 | A B | 640 -> 640 -> 640 -> 640 | 7453 -> 7467 -> 7463 -> 7491 ( +14, +10, +38) GFLOPS
problemSize = 990 | A B | 640 -> 640 -> 640 -> 640 | 7457 -> 7490 -> 7537 -> 7560 ( +33, +80, +103) GFLOPS
problemSize = 991 | A B | 640 -> 640 -> 640 -> 640 | 7495 -> 7489 -> 7526 -> 7605 ( -6, +31, +110) GFLOPS
problemSize = 992 | A B | 640 -> 640 -> 640 -> 640 | 7545 -> 7584 -> 7613 -> 7682 ( +39, +68, +137) GFLOPS
problemSize = 993 | A B | 576 -> 576 -> 576 -> 576 | 7015 -> 6994 -> 6938 -> 6993 ( -21, -77, -22) GFLOPS
problemSize = 994 | A B | 576 -> 576 -> 576 -> 576 | 7028 -> 7060 -> 6976 -> 7039 ( +32, -52, +11) GFLOPS
problemSize = 995 | A B | 576 -> 576 -> 576 -> 576 | 7049 -> 7060 -> 7008 -> 7041 ( +11, -41, -8) GFLOPS
problemSize = 996 | A B | 576 -> 576 -> 576 -> 576 | 7110 -> 7089 -> 7035 -> 7084 ( -21, -75, -26) GFLOPS
problemSize = 997 | A B | 576 -> 576 -> 576 -> 576 | 7132 -> 7108 -> 7037 -> 7111 ( -24, -95, -21) GFLOPS
problemSize = 998 | A B | 576 -> 576 -> 576 -> 576 | 7149 -> 7143 -> 7088 -> 7151 ( -6, -61, +2) GFLOPS
problemSize = 999 | A B | 576 -> 576 -> 576 -> 576 | 7165 -> 7156 -> 7095 -> 7150 ( -9, -70, -15) GFLOPS
problemSize = 1000 | A B | 576 -> 576 -> 576 -> 576 | 7310 -> 7307 -> 7285 -> 7314 ( -3, -25, +4) GFLOPS
problemSize = 1001 | A B | 640 -> 640 -> 640 -> 640 | 7586 -> 7602 -> 7612 -> 7601 ( +16, +26, +15) GFLOPS
problemSize = 1002 | A B | 640 -> 640 -> 640 -> 640 | 7626 -> 7634 -> 7680 -> 7642 ( +8, +54, +16) GFLOPS
problemSize = 1003 | A B | 640 -> 640 -> 640 -> 640 | 7640 -> 7629 -> 7697 -> 7662 ( -11, +57, +22) GFLOPS
problemSize = 1004 | A B | 640 -> 640 -> 640 -> 640 | 7686 -> 7710 -> 7698 -> 7731 ( +24, +12, +45) GFLOPS
problemSize = 1005 | A B | 640 -> 640 -> 640 -> 640 | 7680 -> 7669 -> 7742 -> 7709 ( -11, +62, +29) GFLOPS
problemSize = 1006 | A B | 640 -> 640 -> 640 -> 640 | 7763 -> 7730 -> 7773 -> 7730 ( -33, +10, -33) GFLOPS
problemSize = 1007 | A B | 640 -> 640 -> 640 -> 640 | 7786 -> 7742 -> 7771 -> 7816 ( -44, -15, +30) GFLOPS
problemSize = 1008 | A B | 640 -> 640 -> 640 -> 640 | 7812 -> 7813 -> 7853 -> 7815 ( +1, +41, +3) GFLOPS
problemSize = 1009 | A B | 768 -> 768 -> 768 -> 768 | 7310 -> 7291 -> 7276 -> 7294 ( -19, -34, -16) GFLOPS
problemSize = 1010 | A B | 768 -> 768 -> 768 -> 768 | 7285 -> 7340 -> 7291 -> 7320 ( +55, +6, +35) GFLOPS
problemSize = 1011 | A B | 768 -> 768 -> 768 -> 768 | 7359 -> 7352 -> 7312 -> 7337 ( -7, -47, -22) GFLOPS
problemSize = 1012 | A B | 768 -> 768 -> 768 -> 768 | 7329 -> 7375 -> 7352 -> 7372 ( +46, +23, +43) GFLOPS
problemSize = 1013 | A B | 768 -> 768 -> 768 -> 768 | 7429 -> 7432 -> 7415 -> 7411 ( +3, -14, -18) GFLOPS
problemSize = 1014 | A B | 768 -> 768 -> 768 -> 768 | 7368 -> 7432 -> 7422 -> 7430 ( +64, +54, +62) GFLOPS
problemSize = 1015 | A B | 768 -> 768 -> 768 -> 768 | 7462 -> 7466 -> 7422 -> 7437 ( +4, -40, -25) GFLOPS
problemSize = 1016 | A B | 768 -> 768 -> 768 -> 768 | 7434 -> 7508 -> 7450 -> 7469 ( +74, +16, +35) GFLOPS
problemSize = 1017 | A B | 768 -> 768 -> 704 -> 768 | 7365 -> 7374 -> 7060 -> 7344 ( +9, -305, -21) GFLOPS
problemSize = 1018 | A B | 768 -> 768 -> 704 -> 768 | 7402 -> 7437 -> 7113 -> 7375 ( +35, -289, -27) GFLOPS
problemSize = 1019 | A B | 768 -> 768 -> 704 -> 768 | 7412 -> 7412 -> 7117 -> 7388 ( 0, -295, -24) GFLOPS
problemSize = 1020 | A B | 768 -> 768 -> 704 -> 768 | 7421 -> 7448 -> 7162 -> 7420 ( +27, -259, -1) GFLOPS
problemSize = 1021 | A B | 768 -> 768 -> 704 -> 768 | 7466 -> 7484 -> 7163 -> 7433 ( +18, -303, -33) GFLOPS
problemSize = 1022 | A B | 768 -> 768 -> 704 -> 768 | 7478 -> 7484 -> 7190 -> 7477 ( +6, -288, -1) GFLOPS
problemSize = 1023 | A B | 768 -> 768 -> 704 -> 768 | 7508 -> 7512 -> 7216 -> 7504 ( +4, -292, -4) GFLOPS
problemSize = 1024 | A B | 768 -> 768 -> 704 -> 768 | 7531 -> 7548 -> 7251 -> 7561 ( +17, -280, +30) GFLOPS
problemSize = 976 | A B^T | 832 -> 832 -> 768 -> 832 | 7744 -> 7793 -> 7706 -> 7749 ( +49, -38, +5) GFLOPS
problemSize = 977 | A B^T | 832 -> 832 -> 832 -> 832 | 7549 -> 7549 -> 7545 -> 7606 ( 0, -4, +57) GFLOPS
problemSize = 978 | A B^T | 832 -> 832 -> 832 -> 832 | 7600 -> 7622 -> 7601 -> 7657 ( +22, +1, +57) GFLOPS
problemSize = 979 | A B^T | 832 -> 832 -> 832 -> 832 | 7673 -> 7674 -> 7646 -> 7685 ( +1, -27, +12) GFLOPS
problemSize = 980 | A B^T | 832 -> 832 -> 832 -> 832 | 7690 -> 7768 -> 7703 -> 7747 ( +78, +13, +57) GFLOPS
problemSize = 981 | A B^T | 832 -> 832 -> 832 -> 832 | 7773 -> 7771 -> 7716 -> 7750 ( -2, -57, -23) GFLOPS
problemSize = 982 | A B^T | 832 -> 832 -> 832 -> 832 | 7774 -> 7812 -> 7774 -> 7799 ( +38, 0, +25) GFLOPS
problemSize = 983 | A B^T | 832 -> 832 -> 832 -> 832 | 7861 -> 7837 -> 7788 -> 7823 ( -24, -73, -38) GFLOPS
problemSize = 984 | A B^T | 832 -> 832 -> 832 -> 832 | 7864 -> 7894 -> 7837 -> 7865 ( +30, -27, +1) GFLOPS
problemSize = 985 | A B^T | 704 -> 704 -> 704 -> 704 | 7604 -> 7651 -> 7503 -> 7641 ( +47, -101, +37) GFLOPS
problemSize = 986 | A B^T | 704 -> 704 -> 704 -> 704 | 7208 -> 7642 -> 7660 -> 7699 (+434, +452, +491) GFLOPS
problemSize = 987 | A B^T | 704 -> 704 -> 704 -> 704 | 7731 -> 7705 -> 7696 -> 7768 ( -26, -35, +37) GFLOPS
problemSize = 988 | A B^T | 704 -> 704 -> 704 -> 704 | 7314 -> 7824 -> 7658 -> 7812 (+510, +344, +498) GFLOPS
problemSize = 989 | A B^T | 704 -> 704 -> 704 -> 704 | 7849 -> 7823 -> 7761 -> 7817 ( -26, -88, -32) GFLOPS
problemSize = 990 | A B^T | 704 -> 704 -> 704 -> 704 | 7376 -> 7876 -> 7777 -> 7884 (+500, +401, +508) GFLOPS
problemSize = 991 | A B^T | 704 -> 704 -> 704 -> 704 | 7913 -> 7888 -> 7747 -> 7918 ( -25, -166, +5) GFLOPS
problemSize = 992 | A B^T | 640 -> 704 -> 704 -> 704 | 7801 -> 7898 -> 7816 -> 7975 ( +97, +15, +174) GFLOPS
problemSize = 993 | A B^T | 576 -> 576 -> 576 -> 576 | 7123 -> 7129 -> 7110 -> 7062 ( +6, -13, -61) GFLOPS
problemSize = 994 | A B^T | 576 -> 576 -> 576 -> 576 | 7204 -> 7221 -> 7156 -> 7153 ( +17, -48, -51) GFLOPS
problemSize = 995 | A B^T | 576 -> 576 -> 576 -> 576 | 7256 -> 7234 -> 7232 -> 7163 ( -22, -24, -93) GFLOPS
problemSize = 996 | A B^T | 576 -> 576 -> 576 -> 576 | 7277 -> 7284 -> 7278 -> 7234 ( +7, +1, -43) GFLOPS
problemSize = 997 | A B^T | 576 -> 576 -> 576 -> 576 | 7311 -> 7312 -> 7295 -> 7258 ( +1, -16, -53) GFLOPS
problemSize = 998 | A B^T | 576 -> 576 -> 576 -> 576 | 7333 -> 7359 -> 7342 -> 7291 ( +26, +9, -42) GFLOPS
problemSize = 999 | A B^T | 576 -> 576 -> 576 -> 576 | 7367 -> 7369 -> 7353 -> 7306 ( +2, -14, -61) GFLOPS
problemSize = 1000 | A B^T | 576 -> 576 -> 576 -> 576 | 7373 -> 7385 -> 7348 -> 7382 ( +12, -25, +9) GFLOPS
problemSize = 1001 | A B^T | 704 -> 704 -> 640 -> 704 | 7704 -> 7677 -> 7640 -> 7801 ( -27, -64, +97) GFLOPS
problemSize = 1002 | A B^T | 640 -> 704 -> 640 -> 704 | 7723 -> 7778 -> 7710 -> 7804 ( +55, -13, +81) GFLOPS
problemSize = 1003 | A B^T | 704 -> 704 -> 640 -> 704 | 7811 -> 7865 -> 7798 -> 7857 ( +54, -13, +46) GFLOPS
problemSize = 1004 | A B^T | 640 -> 704 -> 640 -> 704 | 7831 -> 7864 -> 7779 -> 7963 ( +33, -52, +132) GFLOPS
problemSize = 1005 | A B^T | 704 -> 704 -> 640 -> 704 | 7917 -> 7941 -> 7841 -> 7901 ( +24, -76, -16) GFLOPS
problemSize = 1006 | A B^T | 640 -> 704 -> 640 -> 704 | 7895 -> 7958 -> 7885 -> 7971 ( +63, -10, +76) GFLOPS
problemSize = 1007 | A B^T | 704 -> 704 -> 640 -> 704 | 7985 -> 7997 -> 7861 -> 7971 ( +12, -124, -14) GFLOPS
problemSize = 1008 | A B^T | 640 -> 704 -> 640 -> 704 | 7897 -> 8037 -> 8031 -> 8100 (+140, +134, +203) GFLOPS
problemSize = 1009 | A B^T | 832 -> 832 -> 832 -> 832 | 7462 -> 7454 -> 7399 -> 7515 ( -8, -63, +53) GFLOPS
problemSize = 1010 | A B^T | 832 -> 832 -> 832 -> 832 | 7403 -> 7474 -> 7422 -> 7493 ( +71, +19, +90) GFLOPS
problemSize = 1011 | A B^T | 832 -> 832 -> 832 -> 832 | 7496 -> 7506 -> 7445 -> 7530 ( +10, -51, +34) GFLOPS
problemSize = 1012 | A B^T | 832 -> 832 -> 832 -> 832 | 7438 -> 7565 -> 7475 -> 7607 (+127, +37, +169) GFLOPS
problemSize = 1013 | A B^T | 832 -> 832 -> 832 -> 832 | 7586 -> 7600 -> 7504 -> 7595 ( +14, -82, +9) GFLOPS
problemSize = 1014 | A B^T | 832 -> 832 -> 832 -> 832 | 7531 -> 7612 -> 7519 -> 7614 ( +81, -12, +83) GFLOPS
problemSize = 1015 | A B^T | 832 -> 832 -> 832 -> 832 | 7643 -> 7643 -> 7568 -> 7658 ( 0, -75, +15) GFLOPS
problemSize = 1016 | A B^T | 832 -> 832 -> 768 -> 832 | 7634 -> 7671 -> 7613 -> 7695 ( +37, -21, +61) GFLOPS
problemSize = 1017 | A B^T | 832 -> 832 -> 832 -> 832 | 7508 -> 7531 -> 7474 -> 7559 ( +23, -34, +51) GFLOPS
problemSize = 1018 | A B^T | 832 -> 832 -> 832 -> 832 | 7526 -> 7568 -> 7553 -> 7607 ( +42, +27, +81) GFLOPS
problemSize = 1019 | A B^T | 832 -> 832 -> 832 -> 832 | 7619 -> 7642 -> 7529 -> 7637 ( +23, -90, +18) GFLOPS
problemSize = 1020 | A B^T | 832 -> 832 -> 832 -> 832 | 7585 -> 7667 -> 7600 -> 7666 ( +82, +15, +81) GFLOPS
problemSize = 1021 | A B^T | 832 -> 832 -> 832 -> 832 | 7680 -> 7684 -> 7610 -> 7699 ( +4, -70, +19) GFLOPS
problemSize = 1022 | A B^T | 832 -> 832 -> 832 -> 832 | 7660 -> 7722 -> 7648 -> 7720 ( +62, -12, +60) GFLOPS
problemSize = 1023 | A B^T | 832 -> 832 -> 832 -> 832 | 7754 -> 7768 -> 7653 -> 7736 ( +14, -101, -18) GFLOPS
problemSize = 1024 | A B^T | 832 -> 832 -> 768 -> 832 | 7719 -> 7780 -> 7723 -> 7809 ( +61, +4, +90) GFLOPS
### M4 Statistics ###
0 - device store
1 - device store, with extra threadgroup memory allocation
2 - threadgroup store
FP32 (32x32x8)
problemSize = 976 | A B | 1024 -> 1024 -> 1024 | 2944 -> 2939 -> 2946 ( -5, +2) GFLOPS
problemSize = 977 | A B | 1024 -> 1024 -> 1024 | 2866 -> 2927 -> 2912 ( +61, +46) GFLOPS
problemSize = 978 | A B | 1024 -> 1024 -> 1024 | 2922 -> 2934 -> 2927 ( +12, +5) GFLOPS
problemSize = 979 | A B | 1024 -> 1024 -> 1024 | 2880 -> 2940 -> 2930 ( +60, +50) GFLOPS
problemSize = 980 | A B | 1024 -> 1024 -> 1024 | 2947 -> 2953 -> 2945 ( +6, -2) GFLOPS
problemSize = 981 | A B | 1024 -> 1024 -> 1024 | 2902 -> 2959 -> 2951 ( +57, +49) GFLOPS
problemSize = 982 | A B | 1024 -> 1024 -> 1024 | 2954 -> 2971 -> 2957 ( +17, +3) GFLOPS
problemSize = 983 | A B | 1024 -> 1024 -> 1024 | 2917 -> 2983 -> 2966 ( +66, +49) GFLOPS
problemSize = 984 | A B | 1024 -> 1024 -> 1024 | 2985 -> 2981 -> 2995 ( -4, +10) GFLOPS
problemSize = 985 | A B | 1024 -> 1024 -> 1024 | 2911 -> 2965 -> 2956 ( +54, +45) GFLOPS
problemSize = 986 | A B | 1024 -> 1024 -> 1024 | 2969 -> 2985 -> 2963 ( +16, -6) GFLOPS
problemSize = 987 | A B | 1024 -> 1024 -> 1024 | 2919 -> 2975 -> 2964 ( +56, +45) GFLOPS
problemSize = 988 | A B | 1024 -> 1024 -> 1024 | 2992 -> 2995 -> 2988 ( +3, -4) GFLOPS
problemSize = 989 | A B | 1024 -> 1024 -> 1024 | 2933 -> 2996 -> 2986 ( +63, +53) GFLOPS
problemSize = 990 | A B | 1024 -> 1024 -> 1024 | 3003 -> 3016 -> 2996 ( +13, -7) GFLOPS
problemSize = 991 | A B | 1024 -> 1024 -> 1024 | 2940 -> 3024 -> 2997 ( +84, +57) GFLOPS
problemSize = 992 | A B | 1024 -> 1024 -> 1024 | 3066 -> 3075 -> 3054 ( +9, -12) GFLOPS
problemSize = 993 | A B | 1024 -> 1024 -> 1024 | 2768 -> 2845 -> 2837 ( +77, +69) GFLOPS
problemSize = 994 | A B | 1024 -> 1024 -> 1024 | 2819 -> 2850 -> 2852 ( +31, +33) GFLOPS
problemSize = 995 | A B | 1024 -> 1024 -> 1024 | 2792 -> 2851 -> 2835 ( +59, +43) GFLOPS
problemSize = 996 | A B | 1024 -> 1024 -> 1024 | 2864 -> 2879 -> 2872 ( +15, +8) GFLOPS
problemSize = 997 | A B | 1024 -> 1024 -> 1024 | 2809 -> 2861 -> 2853 ( +52, +44) GFLOPS
problemSize = 998 | A B | 1024 -> 1024 -> 1024 | 2855 -> 2886 -> 2875 ( +31, +20) GFLOPS
problemSize = 999 | A B | 1024 -> 1024 -> 1024 | 2824 -> 2870 -> 2859 ( +46, +35) GFLOPS
problemSize = 1000 | A B | 1024 -> 1024 -> 1024 | 2891 -> 2894 -> 2912 ( +3, +21) GFLOPS
problemSize = 1001 | A B | 1024 -> 1024 -> 1024 | 2810 -> 2855 -> 2863 ( +45, +53) GFLOPS
problemSize = 1002 | A B | 1024 -> 1024 -> 1024 | 2861 -> 2893 -> 2892 ( +32, +31) GFLOPS
problemSize = 1003 | A B | 1024 -> 1024 -> 1024 | 2840 -> 2865 -> 2866 ( +25, +26) GFLOPS
problemSize = 1004 | A B | 1024 -> 1024 -> 1024 | 2914 -> 2919 -> 2917 ( +5, +3) GFLOPS
problemSize = 1005 | A B | 1024 -> 1024 -> 1024 | 2839 -> 2873 -> 2874 ( +34, +35) GFLOPS
problemSize = 1006 | A B | 1024 -> 1024 -> 1024 | 2879 -> 2914 -> 2915 ( +35, +36) GFLOPS
problemSize = 1007 | A B | 1024 -> 1024 -> 1024 | 2838 -> 2890 -> 2899 ( +52, +61) GFLOPS
problemSize = 1008 | A B | 1024 -> 1024 -> 1024 | 2937 -> 2934 -> 2963 ( -3, +26) GFLOPS
problemSize = 1009 | A B | 1024 -> 1024 -> 1024 | 2845 -> 2883 -> 2867 ( +38, +22) GFLOPS
problemSize = 1010 | A B | 1024 -> 1024 -> 1024 | 2875 -> 2913 -> 2900 ( +38, +25) GFLOPS
problemSize = 1011 | A B | 1024 -> 1024 -> 1024 | 2859 -> 2886 -> 2875 ( +27, +16) GFLOPS
problemSize = 1012 | A B | 1024 -> 1024 -> 1024 | 2948 -> 2965 -> 2955 ( +17, +7) GFLOPS
problemSize = 1013 | A B | 1024 -> 1024 -> 1024 | 2845 -> 2882 -> 2864 ( +37, +19) GFLOPS
problemSize = 1014 | A B | 1024 -> 1024 -> 1024 | 2888 -> 2939 -> 2915 ( +51, +27) GFLOPS
problemSize = 1015 | A B | 1024 -> 1024 -> 1024 | 2850 -> 2889 -> 2875 ( +39, +25) GFLOPS
problemSize = 1016 | A B | 1024 -> 1024 -> 1024 | 2982 -> 2991 -> 2991 ( +9, +9) GFLOPS
problemSize = 1017 | A B | 1024 -> 1024 -> 1024 | 2849 -> 2882 -> 2863 ( +33, +14) GFLOPS
problemSize = 1018 | A B | 1024 -> 1024 -> 1024 | 2898 -> 2939 -> 2925 ( +41, +27) GFLOPS
problemSize = 1019 | A B | 1024 -> 1024 -> 1024 | 2863 -> 2894 -> 2876 ( +31, +13) GFLOPS
problemSize = 1020 | A B | 1024 -> 1024 -> 1024 | 2986 -> 3004 -> 2994 ( +18, +8) GFLOPS
problemSize = 1021 | A B | 1024 -> 1024 -> 1024 | 2890 -> 2901 -> 2896 ( +11, +6) GFLOPS
problemSize = 1022 | A B | 1024 -> 1024 -> 1024 | 2942 -> 2976 -> 2961 ( +34, +19) GFLOPS
problemSize = 1023 | A B | 1024 -> 1024 -> 1024 | 2926 -> 2946 -> 2931 ( +20, +5) GFLOPS
problemSize = 1024 | A B | 1024 -> 1024 -> 1024 | 3073 -> 3073 -> 3050 ( 0, -23) GFLOPS
problemSize = 976 | A B^T | 1024 -> 1024 -> 1024 | 2922 -> 2917 -> 2909 ( -5, -13) GFLOPS
problemSize = 977 | A B^T | 1024 -> 1024 -> 1024 | 2411 -> 2618 -> 2643 (+207, +232) GFLOPS
problemSize = 978 | A B^T | 1024 -> 1024 -> 1024 | 2512 -> 2776 -> 2778 (+264, +266) GFLOPS
problemSize = 979 | A B^T | 1024 -> 1024 -> 1024 | 2422 -> 2646 -> 2654 (+224, +232) GFLOPS
problemSize = 980 | A B^T | 1024 -> 1024 -> 1024 | 2759 -> 2933 -> 2910 (+174, +151) GFLOPS
problemSize = 981 | A B^T | 1024 -> 1024 -> 1024 | 2440 -> 2674 -> 2675 (+234, +235) GFLOPS
problemSize = 982 | A B^T | 1024 -> 1024 -> 1024 | 2555 -> 2828 -> 2826 (+273, +271) GFLOPS
problemSize = 983 | A B^T | 1024 -> 1024 -> 1024 | 2465 -> 2725 -> 2727 (+260, +262) GFLOPS
problemSize = 984 | A B^T | 1024 -> 1024 -> 1024 | 2961 -> 2955 -> 2948 ( -6, -13) GFLOPS
problemSize = 985 | A B^T | 1024 -> 1024 -> 1024 | 2465 -> 2723 -> 2726 (+258, +261) GFLOPS
problemSize = 986 | A B^T | 1024 -> 1024 -> 1024 | 2581 -> 2870 -> 2860 (+289, +279) GFLOPS
problemSize = 987 | A B^T | 1024 -> 1024 -> 1024 | 2467 -> 2707 -> 2712 (+240, +245) GFLOPS
problemSize = 988 | A B^T | 1024 -> 1024 -> 1024 | 2798 -> 2988 -> 2959 (+190, +161) GFLOPS
problemSize = 989 | A B^T | 1024 -> 1024 -> 1024 | 2482 -> 2725 -> 2729 (+243, +247) GFLOPS
problemSize = 990 | A B^T | 1024 -> 1024 -> 1024 | 2591 -> 2871 -> 2869 (+280, +278) GFLOPS
problemSize = 991 | A B^T | 1024 -> 1024 -> 1024 | 2477 -> 2759 -> 2742 (+282, +265) GFLOPS
problemSize = 992 | A B^T | 1024 -> 1024 -> 1024 | 3034 -> 3033 -> 3018 ( -1, -16) GFLOPS
problemSize = 993 | A B^T | 1024 -> 1024 -> 1024 | 2325 -> 2567 -> 2595 (+242, +270) GFLOPS
problemSize = 994 | A B^T | 1024 -> 1024 -> 1024 | 2423 -> 2720 -> 2726 (+297, +303) GFLOPS
problemSize = 995 | A B^T | 1024 -> 1024 -> 1024 | 2352 -> 2608 -> 2607 (+256, +255) GFLOPS
problemSize = 996 | A B^T | 1024 -> 1024 -> 1024 | 2675 -> 2851 -> 2834 (+176, +159) GFLOPS
problemSize = 997 | A B^T | 1024 -> 1024 -> 1024 | 2392 -> 2629 -> 2650 (+237, +258) GFLOPS
problemSize = 998 | A B^T | 1024 -> 1024 -> 1024 | 2508 -> 2788 -> 2788 (+280, +280) GFLOPS
problemSize = 999 | A B^T | 1024 -> 1024 -> 1024 | 2428 -> 2712 -> 2698 (+284, +270) GFLOPS
problemSize = 1000 | A B^T | 1024 -> 1024 -> 1024 | 2852 -> 2857 -> 2870 ( +5, +18) GFLOPS
problemSize = 1001 | A B^T | 1024 -> 1024 -> 1024 | 2432 -> 2702 -> 2716 (+270, +284) GFLOPS
problemSize = 1002 | A B^T | 1024 -> 1024 -> 1024 | 2554 -> 2820 -> 2832 (+266, +278) GFLOPS
problemSize = 1003 | A B^T | 1024 -> 1024 -> 1024 | 2454 -> 2705 -> 2732 (+251, +278) GFLOPS
problemSize = 1004 | A B^T | 1024 -> 1024 -> 1024 | 2775 -> 2896 -> 2894 (+121, +119) GFLOPS
problemSize = 1005 | A B^T | 1024 -> 1024 -> 1024 | 2436 -> 2686 -> 2697 (+250, +261) GFLOPS
problemSize = 1006 | A B^T | 1024 -> 1024 -> 1024 | 2530 -> 2812 -> 2821 (+282, +291) GFLOPS
problemSize = 1007 | A B^T | 1024 -> 1024 -> 1024 | 2440 -> 2696 -> 2712 (+256, +272) GFLOPS
problemSize = 1008 | A B^T | 1024 -> 1024 -> 1024 | 2930 -> 2927 -> 2921 ( -3, -9) GFLOPS
problemSize = 1009 | A B^T | 1024 -> 1024 -> 1024 | 2445 -> 2710 -> 2723 (+265, +278) GFLOPS
problemSize = 1010 | A B^T | 1024 -> 1024 -> 1024 | 2561 -> 2837 -> 2831 (+276, +270) GFLOPS
problemSize = 1011 | A B^T | 1024 -> 1024 -> 1024 | 2471 -> 2737 -> 2752 (+266, +281) GFLOPS
problemSize = 1012 | A B^T | 1024 -> 1024 -> 1024 | 2794 -> 2928 -> 2915 (+134, +121) GFLOPS
problemSize = 1013 | A B^T | 1024 -> 1024 -> 1024 | 2440 -> 2701 -> 2719 (+261, +279) GFLOPS
problemSize = 1014 | A B^T | 1024 -> 1024 -> 1024 | 2542 -> 2846 -> 2847 (+304, +305) GFLOPS
problemSize = 1015 | A B^T | 1024 -> 1024 -> 1024 | 2431 -> 2701 -> 2713 (+270, +282) GFLOPS
problemSize = 1016 | A B^T | 1024 -> 1024 -> 1024 | 2958 -> 2932 -> 2951 ( -26, -7) GFLOPS
problemSize = 1017 | A B^T | 1024 -> 1024 -> 1024 | 2422 -> 2694 -> 2697 (+272, +275) GFLOPS
problemSize = 1018 | A B^T | 1024 -> 1024 -> 1024 | 2552 -> 2853 -> 2849 (+301, +297) GFLOPS
problemSize = 1019 | A B^T | 1024 -> 1024 -> 1024 | 2429 -> 2701 -> 2712 (+272, +283) GFLOPS
problemSize = 1020 | A B^T | 1024 -> 1024 -> 1024 | 2781 -> 2972 -> 2955 (+191, +174) GFLOPS
problemSize = 1021 | A B^T | 1024 -> 1024 -> 1024 | 2457 -> 2723 -> 2723 (+266, +266) GFLOPS
problemSize = 1022 | A B^T | 1024 -> 1024 -> 1024 | 2562 -> 2875 -> 2863 (+313, +301) GFLOPS
problemSize = 1023 | A B^T | 1024 -> 1024 -> 1024 | 2469 -> 2763 -> 2777 (+294, +308) GFLOPS
problemSize = 1024 | A B^T | 1024 -> 1024 -> 1024 | 3047 -> 3050 -> 3017 ( +3, -30) GFLOPS
problemSize = 976 | A^T B | 1024 -> 1024 -> 1024 | 2909 -> 2915 -> 2919 ( +6, +10) GFLOPS
problemSize = 977 | A^T B | 1024 -> 1024 -> 1024 | 2887 -> 2887 -> 2907 ( 0, +20) GFLOPS
problemSize = 978 | A^T B | 1024 -> 1024 -> 1024 | 2890 -> 2892 -> 2918 ( +2, +28) GFLOPS
problemSize = 979 | A^T B | 1024 -> 1024 -> 1024 | 2897 -> 2903 -> 2925 ( +6, +28) GFLOPS
problemSize = 980 | A^T B | 1024 -> 1024 -> 1024 | 2896 -> 2906 -> 2928 ( +10, +32) GFLOPS
problemSize = 981 | A^T B | 1024 -> 1024 -> 1024 | 2918 -> 2926 -> 2943 ( +8, +25) GFLOPS
problemSize = 982 | A^T B | 1024 -> 1024 -> 1024 | 2925 -> 2926 -> 2952 ( +1, +27) GFLOPS
problemSize = 983 | A^T B | 1024 -> 1024 -> 1024 | 2937 -> 2945 -> 2960 ( +8, +23) GFLOPS
problemSize = 984 | A^T B | 1024 -> 1024 -> 1024 | 2960 -> 2965 -> 2967 ( +5, +7) GFLOPS
problemSize = 985 | A^T B | 1024 -> 1024 -> 1024 | 2933 -> 2937 -> 2957 ( +4, +24) GFLOPS
problemSize = 986 | A^T B | 1024 -> 1024 -> 1024 | 2934 -> 2940 -> 2962 ( +6, +28) GFLOPS
problemSize = 987 | A^T B | 1024 -> 1024 -> 1024 | 2949 -> 2953 -> 2967 ( +4, +18) GFLOPS
problemSize = 988 | A^T B | 1024 -> 1024 -> 1024 | 2949 -> 2961 -> 2979 ( +12, +30) GFLOPS
problemSize = 989 | A^T B | 1024 -> 1024 -> 1024 | 2969 -> 2973 -> 2986 ( +4, +17) GFLOPS
problemSize = 990 | A^T B | 1024 -> 1024 -> 1024 | 2965 -> 2979 -> 2995 ( +14, +30) GFLOPS
problemSize = 991 | A^T B | 1024 -> 1024 -> 1024 | 2984 -> 2990 -> 2998 ( +6, +14) GFLOPS
problemSize = 992 | A^T B | 1024 -> 1024 -> 1024 | 3013 -> 3015 -> 2998 ( +2, -15) GFLOPS
problemSize = 993 | A^T B | 1024 -> 1024 -> 1024 | 2817 -> 2819 -> 2845 ( +2, +28) GFLOPS
problemSize = 994 | A^T B | 1024 -> 1024 -> 1024 | 2814 -> 2824 -> 2849 ( +10, +35) GFLOPS
problemSize = 995 | A^T B | 1024 -> 1024 -> 1024 | 2833 -> 2835 -> 2860 ( +2, +27) GFLOPS
problemSize = 996 | A^T B | 1024 -> 1024 -> 1024 | 2834 -> 2843 -> 2864 ( +9, +30) GFLOPS
problemSize = 997 | A^T B | 1024 -> 1024 -> 1024 | 2853 -> 2853 -> 2874 ( 0, +21) GFLOPS
problemSize = 998 | A^T B | 1024 -> 1024 -> 1024 | 2857 -> 2859 -> 2882 ( +2, +25) GFLOPS
problemSize = 999 | A^T B | 1024 -> 1024 -> 1024 | 2867 -> 2869 -> 2892 ( +2, +25) GFLOPS
problemSize = 1000 | A^T B | 1024 -> 1024 -> 1024 | 2877 -> 2878 -> 2893 ( +1, +16) GFLOPS
problemSize = 1001 | A^T B | 1024 -> 1024 -> 1024 | 2863 -> 2860 -> 2870 ( -3, +7) GFLOPS
problemSize = 1002 | A^T B | 1024 -> 1024 -> 1024 | 2869 -> 2870 -> 2876 ( +1, +7) GFLOPS
problemSize = 1003 | A^T B | 1024 -> 1024 -> 1024 | 2878 -> 2880 -> 2883 ( +2, +5) GFLOPS
problemSize = 1004 | A^T B | 1024 -> 1024 -> 1024 | 2882 -> 2883 -> 2909 ( +1, +27) GFLOPS
problemSize = 1005 | A^T B | 1024 -> 1024 -> 1024 | 2887 -> 2895 -> 2905 ( +8, +18) GFLOPS
problemSize = 1006 | A^T B | 1024 -> 1024 -> 1024 | 2902 -> 2902 -> 2906 ( 0, +4) GFLOPS
problemSize = 1007 | A^T B | 1024 -> 1024 -> 1024 | 2909 -> 2908 -> 2923 ( -1, +14) GFLOPS
problemSize = 1008 | A^T B | 1024 -> 1024 -> 1024 | 2920 -> 2923 -> 2937 ( +3, +17) GFLOPS
problemSize = 1009 | A^T B | 1024 -> 1024 -> 1024 | 2904 -> 2903 -> 2919 ( -1, +15) GFLOPS
problemSize = 1010 | A^T B | 1024 -> 1024 -> 1024 | 2912 -> 2909 -> 2937 ( -3, +25) GFLOPS
problemSize = 1011 | A^T B | 1024 -> 1024 -> 1024 | 2917 -> 2921 -> 2936 ( +4, +19) GFLOPS
problemSize = 1012 | A^T B | 1024 -> 1024 -> 1024 | 2930 -> 2933 -> 2946 ( +3, +16) GFLOPS
problemSize = 1013 | A^T B | 1024 -> 1024 -> 1024 | 2935 -> 2942 -> 2955 ( +7, +20) GFLOPS
problemSize = 1014 | A^T B | 1024 -> 1024 -> 1024 | 2944 -> 2941 -> 2967 ( -3, +23) GFLOPS
problemSize = 1015 | A^T B | 1024 -> 1024 -> 1024 | 2952 -> 2958 -> 2972 ( +6, +20) GFLOPS
problemSize = 1016 | A^T B | 1024 -> 1024 -> 1024 | 2972 -> 2971 -> 2982 ( -1, +10) GFLOPS
problemSize = 1017 | A^T B | 1024 -> 1024 -> 1024 | 2936 -> 2948 -> 2962 ( +12, +26) GFLOPS
problemSize = 1018 | A^T B | 1024 -> 1024 -> 1024 | 2949 -> 2949 -> 2974 ( 0, +25) GFLOPS
problemSize = 1019 | A^T B | 1024 -> 1024 -> 1024 | 2963 -> 2961 -> 2966 ( -2, +3) GFLOPS
problemSize = 1020 | A^T B | 1024 -> 1024 -> 1024 | 2972 -> 2973 -> 2987 ( +1, +15) GFLOPS
problemSize = 1021 | A^T B | 1024 -> 1024 -> 1024 | 2968 -> 2977 -> 2988 ( +9, +20) GFLOPS
problemSize = 1022 | A^T B | 1024 -> 1024 -> 1024 | 2984 -> 2987 -> 3000 ( +3, +16) GFLOPS
problemSize = 1023 | A^T B | 1024 -> 1024 -> 1024 | 2994 -> 2995 -> 3009 ( +1, +15) GFLOPS
problemSize = 1024 | A^T B | 1024 -> 1024 -> 1024 | 3025 -> 3015 -> 3033 ( -10, +8) GFLOPS
problemSize = 976 | A^T B^T | 1024 -> 1024 -> 1024 | 2840 -> 2840 -> 2859 ( 0, +19) GFLOPS
problemSize = 977 | A^T B^T | 1024 -> 1024 -> 1024 | 2819 -> 2849 -> 2868 ( +30, +49) GFLOPS
problemSize = 978 | A^T B^T | 1024 -> 1024 -> 1024 | 2857 -> 2859 -> 2881 ( +2, +24) GFLOPS
problemSize = 979 | A^T B^T | 1024 -> 1024 -> 1024 | 2835 -> 2866 -> 2883 ( +31, +48) GFLOPS
problemSize = 980 | A^T B^T | 1024 -> 1024 -> 1024 | 2911 -> 2917 -> 2902 ( +6, -9) GFLOPS
problemSize = 981 | A^T B^T | 1024 -> 1024 -> 1024 | 2850 -> 2886 -> 2896 ( +36, +46) GFLOPS
problemSize = 982 | A^T B^T | 1024 -> 1024 -> 1024 | 2896 -> 2899 -> 2916 ( +3, +20) GFLOPS
problemSize = 983 | A^T B^T | 1024 -> 1024 -> 1024 | 2875 -> 2901 -> 2921 ( +26, +46) GFLOPS
problemSize = 984 | A^T B^T | 1024 -> 1024 -> 1024 | 2884 -> 2881 -> 2896 ( -3, +12) GFLOPS
problemSize = 985 | A^T B^T | 1024 -> 1024 -> 1024 | 2876 -> 2899 -> 2914 ( +23, +38) GFLOPS
problemSize = 986 | A^T B^T | 1024 -> 1024 -> 1024 | 2911 -> 2910 -> 2927 ( -1, +16) GFLOPS
problemSize = 987 | A^T B^T | 1024 -> 1024 -> 1024 | 2890 -> 2915 -> 2926 ( +25, +36) GFLOPS
problemSize = 988 | A^T B^T | 1024 -> 1024 -> 1024 | 2964 -> 2965 -> 2950 ( +1, -14) GFLOPS
problemSize = 989 | A^T B^T | 1024 -> 1024 -> 1024 | 2907 -> 2932 -> 2943 ( +25, +36) GFLOPS
problemSize = 990 | A^T B^T | 1024 -> 1024 -> 1024 | 2940 -> 2945 -> 2957 ( +5, +17) GFLOPS
problemSize = 991 | A^T B^T | 1024 -> 1024 -> 1024 | 2906 -> 2946 -> 2960 ( +40, +54) GFLOPS
problemSize = 992 | A^T B^T | 1024 -> 1024 -> 1024 | 2978 -> 2979 -> 2964 ( +1, -14) GFLOPS
problemSize = 993 | A^T B^T | 1024 -> 1024 -> 1024 | 2731 -> 2773 -> 2797 ( +42, +66) GFLOPS
problemSize = 994 | A^T B^T | 1024 -> 1024 -> 1024 | 2768 -> 2789 -> 2812 ( +21, +44) GFLOPS
problemSize = 995 | A^T B^T | 1024 -> 1024 -> 1024 | 2749 -> 2790 -> 2810 ( +41, +61) GFLOPS
problemSize = 996 | A^T B^T | 1024 -> 1024 -> 1024 | 2838 -> 2845 -> 2829 ( +7, -9) GFLOPS
problemSize = 997 | A^T B^T | 1024 -> 1024 -> 1024 | 2768 -> 2806 -> 2830 ( +38, +62) GFLOPS
problemSize = 998 | A^T B^T | 1024 -> 1024 -> 1024 | 2803 -> 2820 -> 2843 ( +17, +40) GFLOPS
problemSize = 999 | A^T B^T | 1024 -> 1024 -> 1024 | 2788 -> 2827 -> 2852 ( +39, +64) GFLOPS
problemSize = 1000 | A^T B^T | 1024 -> 1024 -> 1024 | 2790 -> 2793 -> 2830 ( +3, +40) GFLOPS
problemSize = 1001 | A^T B^T | 1024 -> 1024 -> 1024 | 2783 -> 2823 -> 2847 ( +40, +64) GFLOPS
problemSize = 1002 | A^T B^T | 1024 -> 1024 -> 1024 | 2819 -> 2833 -> 2858 ( +14, +39) GFLOPS
problemSize = 1003 | A^T B^T | 1024 -> 1024 -> 1024 | 2805 -> 2835 -> 2857 ( +30, +52) GFLOPS
problemSize = 1004 | A^T B^T | 1024 -> 1024 -> 1024 | 2885 -> 2891 -> 2873 ( +6, -12) GFLOPS
problemSize = 1005 | A^T B^T | 1024 -> 1024 -> 1024 | 2816 -> 2858 -> 2871 ( +42, +55) GFLOPS
problemSize = 1006 | A^T B^T | 1024 -> 1024 -> 1024 | 2850 -> 2867 -> 2886 ( +17, +36) GFLOPS
problemSize = 1007 | A^T B^T | 1024 -> 1024 -> 1024 | 2829 -> 2872 -> 2885 ( +43, +56) GFLOPS
problemSize = 1008 | A^T B^T | 1024 -> 1024 -> 1024 | 2853 -> 2856 -> 2872 ( +3, +19) GFLOPS
problemSize = 1009 | A^T B^T | 1024 -> 1024 -> 1024 | 2827 -> 2864 -> 2881 ( +37, +54) GFLOPS
problemSize = 1010 | A^T B^T | 1024 -> 1024 -> 1024 | 2854 -> 2878 -> 2902 ( +24, +48) GFLOPS
problemSize = 1011 | A^T B^T | 1024 -> 1024 -> 1024 | 2845 -> 2881 -> 2886 ( +36, +41) GFLOPS
problemSize = 1012 | A^T B^T | 1024 -> 1024 -> 1024 | 2926 -> 2934 -> 2918 ( +8, -8) GFLOPS
problemSize = 1013 | A^T B^T | 1024 -> 1024 -> 1024 | 2847 -> 2893 -> 2906 ( +46, +59) GFLOPS
problemSize = 1014 | A^T B^T | 1024 -> 1024 -> 1024 | 2884 -> 2907 -> 2931 ( +23, +47) GFLOPS
problemSize = 1015 | A^T B^T | 1024 -> 1024 -> 1024 | 2858 -> 2913 -> 2923 ( +55, +65) GFLOPS
problemSize = 1016 | A^T B^T | 1024 -> 1024 -> 1024 | 2889 -> 2881 -> 2913 ( -8, +24) GFLOPS
problemSize = 1017 | A^T B^T | 1024 -> 1024 -> 1024 | 2856 -> 2901 -> 2919 ( +45, +63) GFLOPS
problemSize = 1018 | A^T B^T | 1024 -> 1024 -> 1024 | 2896 -> 2918 -> 2928 ( +22, +32) GFLOPS
problemSize = 1019 | A^T B^T | 1024 -> 1024 -> 1024 | 2876 -> 2927 -> 2924 ( +51, +48) GFLOPS
problemSize = 1020 | A^T B^T | 1024 -> 1024 -> 1024 | 2966 -> 2975 -> 2955 ( +9, -11) GFLOPS
problemSize = 1021 | A^T B^T | 1024 -> 1024 -> 1024 | 2889 -> 2933 -> 2940 ( +44, +51) GFLOPS
problemSize = 1022 | A^T B^T | 1024 -> 1024 -> 1024 | 2939 -> 2949 -> 2961 ( +10, +22) GFLOPS
problemSize = 1023 | A^T B^T | 1024 -> 1024 -> 1024 | 2919 -> 2963 -> 2964 ( +44, +45) GFLOPS
problemSize = 1024 | A^T B^T | 1024 -> 1024 -> 1024 | 2989 -> 2994 -> 2962 ( +5, -27) GFLOPS
FP16 (32x32x8)
problemSize = 976 | A B | 1024 -> 1024 -> 1024 | 3292 -> 3292 -> 3292 ( 0, 0) GFLOPS
problemSize = 977 | A B | 1024 -> 1024 -> 1024 | 3278 -> 3275 -> 3260 ( -3, -18) GFLOPS
problemSize = 978 | A B | 1024 -> 1024 -> 1024 | 3283 -> 3285 -> 3266 ( +2, -17) GFLOPS
problemSize = 979 | A B | 1024 -> 1024 -> 1024 | 3295 -> 3295 -> 3270 ( 0, -25) GFLOPS
problemSize = 980 | A B | 1024 -> 1024 -> 1024 | 3306 -> 3314 -> 3296 ( +8, -10) GFLOPS
problemSize = 981 | A B | 1024 -> 1024 -> 1024 | 3314 -> 3308 -> 3284 ( -6, -30) GFLOPS
problemSize = 982 | A B | 1024 -> 1024 -> 1024 | 3330 -> 3328 -> 3302 ( -2, -28) GFLOPS
problemSize = 983 | A B | 1024 -> 1024 -> 1024 | 3333 -> 3318 -> 3301 ( -15, -32) GFLOPS
problemSize = 984 | A B | 1024 -> 1024 -> 1024 | 3345 -> 3339 -> 3340 ( -6, -5) GFLOPS
problemSize = 985 | A B | 1024 -> 1024 -> 1024 | 3325 -> 3313 -> 3299 ( -12, -26) GFLOPS
problemSize = 986 | A B | 1024 -> 1024 -> 1024 | 3337 -> 3329 -> 3313 ( -8, -24) GFLOPS
problemSize = 987 | A B | 1024 -> 1024 -> 1024 | 3345 -> 3335 -> 3310 ( -10, -35) GFLOPS
problemSize = 988 | A B | 1024 -> 1024 -> 1024 | 3359 -> 3368 -> 3342 ( +9, -17) GFLOPS
problemSize = 989 | A B | 1024 -> 1024 -> 1024 | 3369 -> 3348 -> 3326 ( -21, -43) GFLOPS
problemSize = 990 | A B | 1024 -> 1024 -> 1024 | 3375 -> 3373 -> 3345 ( -2, -30) GFLOPS
problemSize = 991 | A B | 1024 -> 1024 -> 1024 | 3383 -> 3370 -> 3349 ( -13, -34) GFLOPS
problemSize = 992 | A B | 1024 -> 1024 -> 1024 | 3396 -> 3396 -> 3398 ( 0, +2) GFLOPS
problemSize = 993 | A B | 1024 -> 1024 -> 1024 | 3181 -> 3167 -> 3157 ( -14, -24) GFLOPS
problemSize = 994 | A B | 1024 -> 1024 -> 1024 | 3192 -> 3181 -> 3174 ( -11, -18) GFLOPS
problemSize = 995 | A B | 1024 -> 1024 -> 1024 | 3197 -> 3170 -> 3169 ( -27, -28) GFLOPS
problemSize = 996 | A B | 1024 -> 1024 -> 1024 | 3213 -> 3211 -> 3200 ( -2, -13) GFLOPS
problemSize = 997 | A B | 1024 -> 1024 -> 1024 | 3214 -> 3183 -> 3182 ( -31, -32) GFLOPS
problemSize = 998 | A B | 1024 -> 1024 -> 1024 | 3227 -> 3211 -> 3205 ( -16, -22) GFLOPS
problemSize = 999 | A B | 1024 -> 1024 -> 1024 | 3237 -> 3196 -> 3197 ( -41, -40) GFLOPS
problemSize = 1000 | A B | 1024 -> 1024 -> 1024 | 3247 -> 3244 -> 3246 ( -3, -1) GFLOPS
problemSize = 1001 | A B | 1024 -> 1024 -> 1024 | 3229 -> 3185 -> 3191 ( -44, -38) GFLOPS
problemSize = 1002 | A B | 1024 -> 1024 -> 1024 | 3241 -> 3219 -> 3210 ( -22, -31) GFLOPS
problemSize = 1003 | A B | 1024 -> 1024 -> 1024 | 3244 -> 3201 -> 3204 ( -43, -40) GFLOPS
problemSize = 1004 | A B | 1024 -> 1024 -> 1024 | 3267 -> 3257 -> 3243 ( -10, -24) GFLOPS
problemSize = 1005 | A B | 1024 -> 1024 -> 1024 | 3258 -> 3209 -> 3213 ( -49, -45) GFLOPS
problemSize = 1006 | A B | 1024 -> 1024 -> 1024 | 3273 -> 3251 -> 3241 ( -22, -32) GFLOPS
problemSize = 1007 | A B | 1024 -> 1024 -> 1024 | 3281 -> 3225 -> 3225 ( -56, -56) GFLOPS
problemSize = 1008 | A B | 1024 -> 1024 -> 1024 | 3298 -> 3299 -> 3297 ( +1, -1) GFLOPS
problemSize = 1009 | A B | 1024 -> 1024 -> 1024 | 3269 -> 3219 -> 3212 ( -50, -57) GFLOPS
problemSize = 1010 | A B | 1024 -> 1024 -> 1024 | 3285 -> 3253 -> 3241 ( -32, -44) GFLOPS
problemSize = 1011 | A B | 1024 -> 1024 -> 1024 | 3284 -> 3229 -> 3226 ( -55, -58) GFLOPS
problemSize = 1012 | A B | 1024 -> 1024 -> 1024 | 3316 -> 3308 -> 3288 ( -8, -28) GFLOPS
problemSize = 1013 | A B | 1024 -> 1024 -> 1024 | 3306 -> 3240 -> 3241 ( -66, -65) GFLOPS
problemSize = 1014 | A B | 1024 -> 1024 -> 1024 | 3325 -> 3279 -> 3274 ( -46, -51) GFLOPS
problemSize = 1015 | A B | 1024 -> 1024 -> 1024 | 3318 -> 3256 -> 3248 ( -62, -70) GFLOPS
problemSize = 1016 | A B | 1024 -> 1024 -> 1024 | 3355 -> 3351 -> 3354 ( -4, -1) GFLOPS
problemSize = 1017 | A B | 1024 -> 1024 -> 1024 | 3313 -> 3248 -> 3239 ( -65, -74) GFLOPS
problemSize = 1018 | A B | 1024 -> 1024 -> 1024 | 3331 -> 3284 -> 3279 ( -47, -52) GFLOPS
problemSize = 1019 | A B | 1024 -> 1024 -> 1024 | 3331 -> 3257 -> 3254 ( -74, -77) GFLOPS
problemSize = 1020 | A B | 1024 -> 1024 -> 1024 | 3366 -> 3341 -> 3332 ( -25, -34) GFLOPS
problemSize = 1021 | A B | 1024 -> 1024 -> 1024 | 3350 -> 3270 -> 3275 ( -80, -75) GFLOPS
problemSize = 1022 | A B | 1024 -> 1024 -> 1024 | 3372 -> 3313 -> 3309 ( -59, -63) GFLOPS
problemSize = 1023 | A B | 1024 -> 1024 -> 1024 | 3376 -> 3302 -> 3285 ( -74, -91) GFLOPS
problemSize = 1024 | A B | 1024 -> 1024 -> 1024 | 3411 -> 3412 -> 3404 ( +1, -7) GFLOPS
problemSize = 976 | A B^T | 1024 -> 1024 -> 1024 | 3262 -> 3264 -> 3248 ( +2, -14) GFLOPS
problemSize = 977 | A B^T | 1024 -> 1024 -> 1024 | 3219 -> 3172 -> 3182 ( -47, -37) GFLOPS
problemSize = 978 | A B^T | 1024 -> 1024 -> 1024 | 3236 -> 3202 -> 3194 ( -34, -42) GFLOPS
problemSize = 979 | A B^T | 1024 -> 1024 -> 1024 | 3232 -> 3180 -> 3172 ( -52, -60) GFLOPS
problemSize = 980 | A B^T | 1024 -> 1024 -> 1024 | 3270 -> 3226 -> 3230 ( -44, -40) GFLOPS
problemSize = 981 | A B^T | 1024 -> 1024 -> 1024 | 3238 -> 3177 -> 3179 ( -61, -59) GFLOPS
problemSize = 982 | A B^T | 1024 -> 1024 -> 1024 | 3269 -> 3208 -> 3203 ( -61, -66) GFLOPS
problemSize = 983 | A B^T | 1024 -> 1024 -> 1024 | 3270 -> 3194 -> 3189 ( -76, -81) GFLOPS
problemSize = 984 | A B^T | 1024 -> 1024 -> 1024 | 3304 -> 3318 -> 3291 ( +14, -13) GFLOPS
problemSize = 985 | A B^T | 1024 -> 1024 -> 1024 | 3265 -> 3184 -> 3182 ( -81, -83) GFLOPS
problemSize = 986 | A B^T | 1024 -> 1024 -> 1024 | 3284 -> 3215 -> 3202 ( -69, -82) GFLOPS
problemSize = 987 | A B^T | 1024 -> 1024 -> 1024 | 3259 -> 3197 -> 3192 ( -62, -67) GFLOPS
problemSize = 988 | A B^T | 1024 -> 1024 -> 1024 | 3320 -> 3252 -> 3252 ( -68, -68) GFLOPS
problemSize = 989 | A B^T | 1024 -> 1024 -> 1024 | 3272 -> 3215 -> 3198 ( -57, -74) GFLOPS
problemSize = 990 | A B^T | 1024 -> 1024 -> 1024 | 3321 -> 3243 -> 3235 ( -78, -86) GFLOPS
problemSize = 991 | A B^T | 1024 -> 1024 -> 1024 | 3317 -> 3242 -> 3225 ( -75, -92) GFLOPS
problemSize = 992 | A B^T | 1024 -> 1024 -> 1024 | 3387 -> 3390 -> 3370 ( +3, -17) GFLOPS
problemSize = 993 | A B^T | 1024 -> 1024 -> 1024 | 3084 -> 3028 -> 3041 ( -56, -43) GFLOPS
problemSize = 994 | A B^T | 1024 -> 1024 -> 1024 | 3116 -> 3048 -> 3055 ( -68, -61) GFLOPS
problemSize = 995 | A B^T | 1024 -> 1024 -> 1024 | 3072 -> 3041 -> 3040 ( -31, -32) GFLOPS
problemSize = 996 | A B^T | 1024 -> 1024 -> 1024 | 3157 -> 3085 -> 3090 ( -72, -67) GFLOPS
problemSize = 997 | A B^T | 1024 -> 1024 -> 1024 | 3088 -> 3050 -> 3050 ( -38, -38) GFLOPS
problemSize = 998 | A B^T | 1024 -> 1024 -> 1024 | 3136 -> 3069 -> 3071 ( -67, -65) GFLOPS
problemSize = 999 | A B^T | 1024 -> 1024 -> 1024 | 3113 -> 3067 -> 3067 ( -46, -46) GFLOPS
problemSize = 1000 | A B^T | 1024 -> 1024 -> 1024 | 3209 -> 3212 -> 3191 ( +3, -18) GFLOPS
problemSize = 1001 | A B^T | 1024 -> 1024 -> 1024 | 3112 -> 3061 -> 3055 ( -51, -57) GFLOPS
problemSize = 1002 | A B^T | 1024 -> 1024 -> 1024 | 3145 -> 3082 -> 3075 ( -63, -70) GFLOPS
problemSize = 1003 | A B^T | 1024 -> 1024 -> 1024 | 3119 -> 3078 -> 3066 ( -41, -53) GFLOPS
problemSize = 1004 | A B^T | 1024 -> 1024 -> 1024 | 3194 -> 3119 -> 3114 ( -75, -80) GFLOPS
problemSize = 1005 | A B^T | 1024 -> 1024 -> 1024 | 3122 -> 3091 -> 3087 ( -31, -35) GFLOPS
problemSize = 1006 | A B^T | 1024 -> 1024 -> 1024 | 3170 -> 3117 -> 3100 ( -53, -70) GFLOPS
problemSize = 1007 | A B^T | 1024 -> 1024 -> 1024 | 3140 -> 3107 -> 3102 ( -33, -38) GFLOPS
problemSize = 1008 | A B^T | 1024 -> 1024 -> 1024 | 3274 -> 3273 -> 3237 ( -1, -37) GFLOPS
problemSize = 1009 | A B^T | 1024 -> 1024 -> 1024 | 3133 -> 3103 -> 3100 ( -30, -33) GFLOPS
problemSize = 1010 | A B^T | 1024 -> 1024 -> 1024 | 3175 -> 3120 -> 3117 ( -55, -58) GFLOPS
problemSize = 1011 | A B^T | 1024 -> 1024 -> 1024 | 3136 -> 3117 -> 3109 ( -19, -27) GFLOPS
problemSize = 1012 | A B^T | 1024 -> 1024 -> 1024 | 3228 -> 3154 -> 3153 ( -74, -75) GFLOPS
problemSize = 1013 | A B^T | 1024 -> 1024 -> 1024 | 3139 -> 3130 -> 3124 ( -9, -15) GFLOPS
problemSize = 1014 | A B^T | 1024 -> 1024 -> 1024 | 3184 -> 3151 -> 3144 ( -33, -40) GFLOPS
problemSize = 1015 | A B^T | 1024 -> 1024 -> 1024 | 3140 -> 3149 -> 3132 ( +9, -8) GFLOPS
problemSize = 1016 | A B^T | 1024 -> 1024 -> 1024 | 3311 -> 3313 -> 3252 ( +2, -59) GFLOPS
problemSize = 1017 | A B^T | 1024 -> 1024 -> 1024 | 3135 -> 3136 -> 3130 ( +1, -5) GFLOPS
problemSize = 1018 | A B^T | 1024 -> 1024 -> 1024 | 3194 -> 3161 -> 3152 ( -33, -42) GFLOPS
problemSize = 1019 | A B^T | 1024 -> 1024 -> 1024 | 3159 -> 3142 -> 3140 ( -17, -19) GFLOPS
problemSize = 1020 | A B^T | 1024 -> 1024 -> 1024 | 3020 -> 2935 -> 3109 ( -85, +89) GFLOPS
problemSize = 1021 | A B^T | 1024 -> 1024 -> 1024 | 2769 -> 2690 -> 2856 ( -79, +87) GFLOPS
problemSize = 1022 | A B^T | 1024 -> 1024 -> 1024 | 2625 -> 2597 -> 2645 ( -28, +20) GFLOPS
problemSize = 1023 | A B^T | 1024 -> 1024 -> 1024 | 2614 -> 2671 -> 2621 ( +57, +7) GFLOPS
problemSize = 1024 | A B^T | 1024 -> 1024 -> 1024 | 2711 -> 2729 -> 2666 ( +18, -45) GFLOPS
problemSize = 976 | A^T B | 1024 -> 1024 -> 1024 | 3278 -> 3284 -> 3277 ( +6, -1) GFLOPS
problemSize = 977 | A^T B | 1024 -> 1024 -> 1024 | 3239 -> 3238 -> 3234 ( -1, -5) GFLOPS
problemSize = 978 | A^T B | 1024 -> 1024 -> 1024 | 3254 -> 3253 -> 3247 ( -1, -7) GFLOPS
problemSize = 979 | A^T B | 1024 -> 1024 -> 1024 | 3261 -> 3251 -> 3248 ( -10, -13) GFLOPS
problemSize = 980 | A^T B | 1024 -> 1024 -> 1024 | 3283 -> 3284 -> 3266 ( +1, -17) GFLOPS
problemSize = 981 | A^T B | 1024 -> 1024 -> 1024 | 3280 -> 3277 -> 3255 ( -3, -25) GFLOPS
problemSize = 982 | A^T B | 1024 -> 1024 -> 1024 | 3286 -> 3283 -> 3273 ( -3, -13) GFLOPS
problemSize = 983 | A^T B | 1024 -> 1024 -> 1024 | 3287 -> 3292 -> 3269 ( +5, -18) GFLOPS
problemSize = 984 | A^T B | 1024 -> 1024 -> 1024 | 3333 -> 3332 -> 3328 ( -1, -5) GFLOPS
problemSize = 985 | A^T B | 1024 -> 1024 -> 1024 | 3287 -> 3286 -> 3265 ( -1, -22) GFLOPS
problemSize = 986 | A^T B | 1024 -> 1024 -> 1024 | 3302 -> 3299 -> 3281 ( -3, -21) GFLOPS
problemSize = 987 | A^T B | 1024 -> 1024 -> 1024 | 3312 -> 3302 -> 3277 ( -10, -35) GFLOPS
problemSize = 988 | A^T B | 1024 -> 1024 -> 1024 | 3340 -> 3322 -> 3312 ( -18, -28) GFLOPS
problemSize = 989 | A^T B | 1024 -> 1024 -> 1024 | 3327 -> 3313 -> 3287 ( -14, -40) GFLOPS
problemSize = 990 | A^T B | 1024 -> 1024 -> 1024 | 3341 -> 3334 -> 3309 ( -7, -32) GFLOPS
problemSize = 991 | A^T B | 1024 -> 1024 -> 1024 | 3348 -> 3331 -> 3311 ( -17, -37) GFLOPS
problemSize = 992 | A^T B | 1024 -> 1024 -> 1024 | 3367 -> 3364 -> 3381 ( -3, +14) GFLOPS
problemSize = 993 | A^T B | 1024 -> 1024 -> 1024 | 3155 -> 3132 -> 3126 ( -23, -29) GFLOPS
problemSize = 994 | A^T B | 1024 -> 1024 -> 1024 | 3163 -> 3152 -> 3144 ( -11, -19) GFLOPS
problemSize = 995 | A^T B | 1024 -> 1024 -> 1024 | 3167 -> 3145 -> 3136 ( -22, -31) GFLOPS
problemSize = 996 | A^T B | 1024 -> 1024 -> 1024 | 3192 -> 3185 -> 3174 ( -7, -18) GFLOPS
problemSize = 997 | A^T B | 1024 -> 1024 -> 1024 | 3185 -> 3159 -> 3149 ( -26, -36) GFLOPS
problemSize = 998 | A^T B | 1024 -> 1024 -> 1024 | 3199 -> 3181 -> 3173 ( -18, -26) GFLOPS
problemSize = 999 | A^T B | 1024 -> 1024 -> 1024 | 3198 -> 3174 -> 3169 ( -24, -29) GFLOPS
problemSize = 1000 | A^T B | 1024 -> 1024 -> 1024 | 3241 -> 3239 -> 3237 ( -2, -4) GFLOPS
problemSize = 1001 | A^T B | 1024 -> 1024 -> 1024 | 3198 -> 3173 -> 3159 ( -25, -39) GFLOPS
problemSize = 1002 | A^T B | 1024 -> 1024 -> 1024 | 3215 -> 3191 -> 3183 ( -24, -32) GFLOPS
problemSize = 1003 | A^T B | 1024 -> 1024 -> 1024 | 3210 -> 3186 -> 3173 ( -24, -37) GFLOPS
problemSize = 1004 | A^T B | 1024 -> 1024 -> 1024 | 3241 -> 3231 -> 3217 ( -10, -24) GFLOPS
problemSize = 1005 | A^T B | 1024 -> 1024 -> 1024 | 3226 -> 3194 -> 3179 ( -32, -47) GFLOPS
problemSize = 1006 | A^T B | 1024 -> 1024 -> 1024 | 3252 -> 3228 -> 3212 ( -24, -40) GFLOPS
problemSize = 1007 | A^T B | 1024 -> 1024 -> 1024 | 3249 -> 3221 -> 3203 ( -28, -46) GFLOPS
problemSize = 1008 | A^T B | 1024 -> 1024 -> 1024 | 3296 -> 3296 -> 3290 ( 0, -6) GFLOPS
problemSize = 1009 | A^T B | 1024 -> 1024 -> 1024 | 3244 -> 3208 -> 3197 ( -36, -47) GFLOPS
problemSize = 1010 | A^T B | 1024 -> 1024 -> 1024 | 3256 -> 3233 -> 3220 ( -23, -36) GFLOPS
problemSize = 1011 | A^T B | 1024 -> 1024 -> 1024 | 3259 -> 3214 -> 3205 ( -45, -54) GFLOPS
problemSize = 1012 | A^T B | 1024 -> 1024 -> 1024 | 3294 -> 3276 -> 3259 ( -18, -35) GFLOPS
problemSize = 1013 | A^T B | 1024 -> 1024 -> 1024 | 3278 -> 3231 -> 3216 ( -47, -62) GFLOPS
problemSize = 1014 | A^T B | 1024 -> 1024 -> 1024 | 3296 -> 3265 -> 3253 ( -31, -43) GFLOPS
problemSize = 1015 | A^T B | 1024 -> 1024 -> 1024 | 3293 -> 3248 -> 3222 ( -45, -71) GFLOPS
problemSize = 1016 | A^T B | 1024 -> 1024 -> 1024 | 3343 -> 3342 -> 3339 ( -1, -4) GFLOPS
problemSize = 1017 | A^T B | 1024 -> 1024 -> 1024 | 3288 -> 3234 -> 3229 ( -54, -59) GFLOPS
problemSize = 1018 | A^T B | 1024 -> 1024 -> 1024 | 3306 -> 3273 -> 3251 ( -33, -55) GFLOPS
problemSize = 1019 | A^T B | 1024 -> 1024 -> 1024 | 3308 -> 3259 -> 3243 ( -49, -65) GFLOPS
problemSize = 1020 | A^T B | 1024 -> 1024 -> 1024 | 3340 -> 3320 -> 3309 ( -20, -31) GFLOPS
problemSize = 1021 | A^T B | 1024 -> 1024 -> 1024 | 3327 -> 3275 -> 3247 ( -52, -80) GFLOPS
problemSize = 1022 | A^T B | 1024 -> 1024 -> 1024 | 3339 -> 3310 -> 3271 ( -29, -68) GFLOPS
problemSize = 1023 | A^T B | 1024 -> 1024 -> 1024 | 3342 -> 3288 -> 3274 ( -54, -68) GFLOPS
problemSize = 1024 | A^T B | 1024 -> 1024 -> 1024 | 3376 -> 3378 -> 3405 ( +2, +29) GFLOPS
problemSize = 976 | A^T B^T | 1024 -> 1024 -> 1024 | 3235 -> 3240 -> 3241 ( +5, +6) GFLOPS
problemSize = 977 | A^T B^T | 1024 -> 1024 -> 1024 | 3211 -> 3190 -> 3185 ( -21, -26) GFLOPS
problemSize = 978 | A^T B^T | 1024 -> 1024 -> 1024 | 3222 -> 3213 -> 3187 ( -9, -35) GFLOPS
problemSize = 979 | A^T B^T | 1024 -> 1024 -> 1024 | 3226 -> 3200 -> 3185 ( -26, -41) GFLOPS
problemSize = 980 | A^T B^T | 1024 -> 1024 -> 1024 | 3250 -> 3240 -> 3219 ( -10, -31) GFLOPS
problemSize = 981 | A^T B^T | 1024 -> 1024 -> 1024 | 3247 -> 3207 -> 3188 ( -40, -59) GFLOPS
problemSize = 982 | A^T B^T | 1024 -> 1024 -> 1024 | 3255 -> 3223 -> 3204 ( -32, -51) GFLOPS
problemSize = 983 | A^T B^T | 1024 -> 1024 -> 1024 | 3263 -> 3214 -> 3210 ( -49, -53) GFLOPS
problemSize = 984 | A^T B^T | 1024 -> 1024 -> 1024 | 3286 -> 3291 -> 3289 ( +5, +3) GFLOPS
problemSize = 985 | A^T B^T | 1024 -> 1024 -> 1024 | 3259 -> 3205 -> 3190 ( -54, -69) GFLOPS
problemSize = 986 | A^T B^T | 1024 -> 1024 -> 1024 | 3268 -> 3224 -> 3211 ( -44, -57) GFLOPS
problemSize = 987 | A^T B^T | 1024 -> 1024 -> 1024 | 3270 -> 3220 -> 3202 ( -50, -68) GFLOPS
problemSize = 988 | A^T B^T | 1024 -> 1024 -> 1024 | 3301 -> 3266 -> 3256 ( -35, -45) GFLOPS
problemSize = 989 | A^T B^T | 1024 -> 1024 -> 1024 | 3292 -> 3235 -> 3209 ( -57, -83) GFLOPS
problemSize = 990 | A^T B^T | 1024 -> 1024 -> 1024 | 3308 -> 3265 -> 3252 ( -43, -56) GFLOPS
problemSize = 991 | A^T B^T | 1024 -> 1024 -> 1024 | 3312 -> 3263 -> 3237 ( -49, -75) GFLOPS
problemSize = 992 | A^T B^T | 1024 -> 1024 -> 1024 | 3326 -> 3324 -> 3339 ( -2, +13) GFLOPS
problemSize = 993 | A^T B^T | 1024 -> 1024 -> 1024 | 3112 -> 3059 -> 3055 ( -53, -57) GFLOPS
problemSize = 994 | A^T B^T | 1024 -> 1024 -> 1024 | 3121 -> 3073 -> 3070 ( -48, -51) GFLOPS
problemSize = 995 | A^T B^T | 1024 -> 1024 -> 1024 | 3120 -> 3058 -> 3049 ( -62, -71) GFLOPS
problemSize = 996 | A^T B^T | 1024 -> 1024 -> 1024 | 3145 -> 3106 -> 3107 ( -39, -38) GFLOPS
problemSize = 997 | A^T B^T | 1024 -> 1024 -> 1024 | 3138 -> 3077 -> 3072 ( -61, -66) GFLOPS
problemSize = 998 | A^T B^T | 1024 -> 1024 -> 1024 | 3152 -> 3091 -> 3085 ( -61, -67) GFLOPS
problemSize = 999 | A^T B^T | 1024 -> 1024 -> 1024 | 3155 -> 3088 -> 3083 ( -67, -72) GFLOPS
problemSize = 1000 | A^T B^T | 1024 -> 1024 -> 1024 | 3194 -> 3195 -> 3186 ( +1, -8) GFLOPS
problemSize = 1001 | A^T B^T | 1024 -> 1024 -> 1024 | 3149 -> 3081 -> 3072 ( -68, -77) GFLOPS
problemSize = 1002 | A^T B^T | 1024 -> 1024 -> 1024 | 3167 -> 3100 -> 3093 ( -67, -74) GFLOPS
problemSize = 1003 | A^T B^T | 1024 -> 1024 -> 1024 | 3164 -> 3089 -> 3086 ( -75, -78) GFLOPS
problemSize = 1004 | A^T B^T | 1024 -> 1024 -> 1024 | 3195 -> 3143 -> 3128 ( -52, -67) GFLOPS
problemSize = 1005 | A^T B^T | 1024 -> 1024 -> 1024 | 3181 -> 3097 -> 3095 ( -84, -86) GFLOPS
problemSize = 1006 | A^T B^T | 1024 -> 1024 -> 1024 | 3207 -> 3134 -> 3120 ( -73, -87) GFLOPS
problemSize = 1007 | A^T B^T | 1024 -> 1024 -> 1024 | 3206 -> 3129 -> 3121 ( -77, -85) GFLOPS
problemSize = 1008 | A^T B^T | 1024 -> 1024 -> 1024 | 3244 -> 3245 -> 3241 ( +1, -3) GFLOPS
problemSize = 1009 | A^T B^T | 1024 -> 1024 -> 1024 | 3200 -> 3126 -> 3114 ( -74, -86) GFLOPS
problemSize = 1010 | A^T B^T | 1024 -> 1024 -> 1024 | 3214 -> 3138 -> 3132 ( -76, -82) GFLOPS
problemSize = 1011 | A^T B^T | 1024 -> 1024 -> 1024 | 3206 -> 3125 -> 3116 ( -81, -90) GFLOPS
problemSize = 1012 | A^T B^T | 1024 -> 1024 -> 1024 | 3241 -> 3179 -> 3175 ( -62, -66) GFLOPS
problemSize = 1013 | A^T B^T | 1024 -> 1024 -> 1024 | 3228 -> 3147 -> 3136 ( -81, -92) GFLOPS
problemSize = 1014 | A^T B^T | 1024 -> 1024 -> 1024 | 3241 -> 3168 -> 3153 ( -73, -88) GFLOPS
problemSize = 1015 | A^T B^T | 1024 -> 1024 -> 1024 | 3241 -> 3164 -> 3148 ( -77, -93) GFLOPS
problemSize = 1016 | A^T B^T | 1024 -> 1024 -> 1024 | 3297 -> 3300 -> 3282 ( +3, -15) GFLOPS
problemSize = 1017 | A^T B^T | 1024 -> 1024 -> 1024 | 3234 -> 3158 -> 3140 ( -76, -94) GFLOPS
problemSize = 1018 | A^T B^T | 1024 -> 1024 -> 1024 | 3256 -> 3173 -> 3157 ( -83, -99) GFLOPS
problemSize = 1019 | A^T B^T | 1024 -> 1024 -> 1024 | 3252 -> 3170 -> 3153 ( -82, -99) GFLOPS
problemSize = 1020 | A^T B^T | 1024 -> 1024 -> 1024 | 3282 -> 3218 -> 3210 ( -64, -72) GFLOPS
problemSize = 1021 | A^T B^T | 1024 -> 1024 -> 1024 | 3268 -> 3184 -> 3171 ( -84, -97) GFLOPS
problemSize = 1022 | A^T B^T | 1024 -> 1024 -> 1024 | 3292 -> 3214 -> 3196 ( -78, -96) GFLOPS
problemSize = 1023 | A^T B^T | 1024 -> 1024 -> 1024 | 3298 -> 3212 -> 3199 ( -86, -99) GFLOPS
problemSize = 1024 | A^T B^T | 1024 -> 1024 -> 1024 | 3332 -> 3330 -> 3348 ( -2, +16) GFLOPS
BF16 (32x32x8)
problemSize = 976 | A B | 1024 -> 1024 -> 1024 | 3224 -> 3223 -> 3207 ( -1, -17) GFLOPS
problemSize = 977 | A B | 1024 -> 1024 -> 1024 | 3211 -> 3216 -> 3184 ( +5, -27) GFLOPS
problemSize = 978 | A B | 1024 -> 1024 -> 1024 | 3218 -> 3222 -> 3196 ( +4, -22) GFLOPS
problemSize = 979 | A B | 1024 -> 1024 -> 1024 | 3232 -> 3234 -> 3204 ( +2, -28) GFLOPS
problemSize = 980 | A B | 1024 -> 1024 -> 1024 | 3248 -> 3254 -> 3209 ( +6, -39) GFLOPS
problemSize = 981 | A B | 1024 -> 1024 -> 1024 | 3253 -> 3260 -> 3220 ( +7, -33) GFLOPS
problemSize = 982 | A B | 1024 -> 1024 -> 1024 | 3260 -> 3268 -> 3229 ( +8, -31) GFLOPS
problemSize = 983 | A B | 1024 -> 1024 -> 1024 | 3266 -> 3272 -> 3234 ( +6, -32) GFLOPS
problemSize = 984 | A B | 1024 -> 1024 -> 1024 | 3273 -> 3272 -> 3257 ( -1, -16) GFLOPS
problemSize = 985 | A B | 1024 -> 1024 -> 1024 | 3262 -> 3271 -> 3228 ( +9, -34) GFLOPS
problemSize = 986 | A B | 1024 -> 1024 -> 1024 | 3273 -> 3279 -> 3239 ( +6, -34) GFLOPS
problemSize = 987 | A B | 1024 -> 1024 -> 1024 | 3284 -> 3289 -> 3254 ( +5, -30) GFLOPS
problemSize = 988 | A B | 1024 -> 1024 -> 1024 | 3299 -> 3302 -> 3266 ( +3, -33) GFLOPS
problemSize = 989 | A B | 1024 -> 1024 -> 1024 | 3305 -> 3310 -> 3258 ( +5, -47) GFLOPS
problemSize = 990 | A B | 1024 -> 1024 -> 1024 | 3311 -> 3318 -> 3276 ( +7, -35) GFLOPS
problemSize = 991 | A B | 1024 -> 1024 -> 1024 | 3322 -> 3326 -> 3284 ( +4, -38) GFLOPS
problemSize = 992 | A B | 1024 -> 1024 -> 1024 | 3318 -> 3318 -> 3321 ( 0, +3) GFLOPS
problemSize = 993 | A B | 1024 -> 1024 -> 1024 | 3127 -> 3131 -> 3102 ( +4, -25) GFLOPS
problemSize = 994 | A B | 1024 -> 1024 -> 1024 | 3140 -> 3136 -> 3111 ( -4, -29) GFLOPS
problemSize = 995 | A B | 1024 -> 1024 -> 1024 | 3148 -> 3151 -> 3120 ( +3, -28) GFLOPS
problemSize = 996 | A B | 1024 -> 1024 -> 1024 | 3158 -> 3157 -> 3123 ( -1, -35) GFLOPS
problemSize = 997 | A B | 1024 -> 1024 -> 1024 | 3168 -> 3164 -> 3140 ( -4, -28) GFLOPS
problemSize = 998 | A B | 1024 -> 1024 -> 1024 | 3178 -> 3171 -> 3146 ( -7, -32) GFLOPS
problemSize = 999 | A B | 1024 -> 1024 -> 1024 | 3188 -> 3183 -> 3158 ( -5, -30) GFLOPS
problemSize = 1000 | A B | 1024 -> 1024 -> 1024 | 3178 -> 3182 -> 3171 ( +4, -7) GFLOPS
problemSize = 1001 | A B | 1024 -> 1024 -> 1024 | 3180 -> 3179 -> 3148 ( -1, -32) GFLOPS
problemSize = 1002 | A B | 1024 -> 1024 -> 1024 | 3192 -> 3188 -> 3152 ( -4, -40) GFLOPS
problemSize = 1003 | A B | 1024 -> 1024 -> 1024 | 3201 -> 3201 -> 3166 ( 0, -35) GFLOPS
problemSize = 1004 | A B | 1024 -> 1024 -> 1024 | 3210 -> 3212 -> 3172 ( +2, -38) GFLOPS
problemSize = 1005 | A B | 1024 -> 1024 -> 1024 | 3221 -> 3223 -> 3185 ( +2, -36) GFLOPS
problemSize = 1006 | A B | 1024 -> 1024 -> 1024 | 3229 -> 3227 -> 3189 ( -2, -40) GFLOPS
problemSize = 1007 | A B | 1024 -> 1024 -> 1024 | 3238 -> 3235 -> 3200 ( -3, -38) GFLOPS
problemSize = 1008 | A B | 1024 -> 1024 -> 1024 | 3229 -> 3231 -> 3215 ( +2, -14) GFLOPS
problemSize = 1009 | A B | 1024 -> 1024 -> 1024 | 3233 -> 3235 -> 3202 ( +2, -31) GFLOPS
problemSize = 1010 | A B | 1024 -> 1024 -> 1024 | 3241 -> 3241 -> 3208 ( 0, -33) GFLOPS
problemSize = 1011 | A B | 1024 -> 1024 -> 1024 | 3254 -> 3253 -> 3223 ( -1, -31) GFLOPS
problemSize = 1012 | A B | 1024 -> 1024 -> 1024 | 3260 -> 3265 -> 3227 ( +5, -33) GFLOPS
problemSize = 1013 | A B | 1024 -> 1024 -> 1024 | 3268 -> 3274 -> 3239 ( +6, -29) GFLOPS
problemSize = 1014 | A B | 1024 -> 1024 -> 1024 | 3284 -> 3279 -> 3246 ( -5, -38) GFLOPS
problemSize = 1015 | A B | 1024 -> 1024 -> 1024 | 3292 -> 3291 -> 3264 ( -1, -28) GFLOPS
problemSize = 1016 | A B | 1024 -> 1024 -> 1024 | 3279 -> 3282 -> 3270 ( +3, -9) GFLOPS
problemSize = 1017 | A B | 1024 -> 1024 -> 1024 | 3286 -> 3285 -> 3255 ( -1, -31) GFLOPS
problemSize = 1018 | A B | 1024 -> 1024 -> 1024 | 3295 -> 3295 -> 3259 ( 0, -36) GFLOPS
problemSize = 1019 | A B | 1024 -> 1024 -> 1024 | 3307 -> 3305 -> 3271 ( -2, -36) GFLOPS
problemSize = 1020 | A B | 1024 -> 1024 -> 1024 | 3316 -> 3317 -> 3276 ( +1, -40) GFLOPS
problemSize = 1021 | A B | 1024 -> 1024 -> 1024 | 3324 -> 3324 -> 3285 ( 0, -39) GFLOPS
problemSize = 1022 | A B | 1024 -> 1024 -> 1024 | 3332 -> 3337 -> 3295 ( +5, -37) GFLOPS
problemSize = 1023 | A B | 1024 -> 1024 -> 1024 | 3344 -> 3344 -> 3308 ( 0, -36) GFLOPS
problemSize = 1024 | A B | 1024 -> 1024 -> 1024 | 3332 -> 3312 -> 3343 ( -20, +11) GFLOPS
problemSize = 976 | A B^T | 1024 -> 1024 -> 1024 | 3209 -> 3212 -> 3179 ( +3, -30) GFLOPS
problemSize = 977 | A B^T | 1024 -> 1024 -> 1024 | 3159 -> 3176 -> 3125 ( +17, -34) GFLOPS
problemSize = 978 | A B^T | 1024 -> 1024 -> 1024 | 3174 -> 3194 -> 3138 ( +20, -36) GFLOPS
problemSize = 979 | A B^T | 1024 -> 1024 -> 1024 | 3171 -> 3200 -> 3148 ( +29, -23) GFLOPS
problemSize = 980 | A B^T | 1024 -> 1024 -> 1024 | 3205 -> 3210 -> 3161 ( +5, -44) GFLOPS
problemSize = 981 | A B^T | 1024 -> 1024 -> 1024 | 3188 -> 3215 -> 3162 ( +27, -26) GFLOPS
problemSize = 982 | A B^T | 1024 -> 1024 -> 1024 | 3214 -> 3236 -> 3174 ( +22, -40) GFLOPS
problemSize = 983 | A B^T | 1024 -> 1024 -> 1024 | 3215 -> 3239 -> 3179 ( +24, -36) GFLOPS
problemSize = 984 | A B^T | 1024 -> 1024 -> 1024 | 3255 -> 3258 -> 3217 ( +3, -38) GFLOPS
problemSize = 985 | A B^T | 1024 -> 1024 -> 1024 | 3214 -> 3226 -> 3177 ( +12, -37) GFLOPS
problemSize = 986 | A B^T | 1024 -> 1024 -> 1024 | 3227 -> 3245 -> 3183 ( +18, -44) GFLOPS
problemSize = 987 | A B^T | 1024 -> 1024 -> 1024 | 3231 -> 3246 -> 3192 ( +15, -39) GFLOPS
problemSize = 988 | A B^T | 1024 -> 1024 -> 1024 | 3262 -> 3260 -> 3208 ( -2, -54) GFLOPS
problemSize = 989 | A B^T | 1024 -> 1024 -> 1024 | 3250 -> 3268 -> 3214 ( +18, -36) GFLOPS
problemSize = 990 | A B^T | 1024 -> 1024 -> 1024 | 3268 -> 3278 -> 3222 ( +10, -46) GFLOPS
problemSize = 991 | A B^T | 1024 -> 1024 -> 1024 | 3261 -> 3286 -> 3226 ( +25, -35) GFLOPS
problemSize = 992 | A B^T | 1024 -> 1024 -> 1024 | 3302 -> 3302 -> 3288 ( 0, -14) GFLOPS
problemSize = 993 | A B^T | 1024 -> 1024 -> 1024 | 3057 -> 3087 -> 3044 ( +30, -13) GFLOPS
problemSize = 994 | A B^T | 1024 -> 1024 -> 1024 | 3081 -> 3098 -> 3055 ( +17, -26) GFLOPS
problemSize = 995 | A B^T | 1024 -> 1024 -> 1024 | 3071 -> 3106 -> 3060 ( +35, -11) GFLOPS
problemSize = 996 | A B^T | 1024 -> 1024 -> 1024 | 3110 -> 3120 -> 3077 ( +10, -33) GFLOPS
problemSize = 997 | A B^T | 1024 -> 1024 -> 1024 | 3091 -> 3123 -> 3079 ( +32, -12) GFLOPS
problemSize = 998 | A B^T | 1024 -> 1024 -> 1024 | 3120 -> 3135 -> 3091 ( +15, -29) GFLOPS
problemSize = 999 | A B^T | 1024 -> 1024 -> 1024 | 3125 -> 3144 -> 3100 ( +19, -25) GFLOPS
problemSize = 1000 | A B^T | 1024 -> 1024 -> 1024 | 3158 -> 3157 -> 3129 ( -1, -29) GFLOPS
problemSize = 1001 | A B^T | 1024 -> 1024 -> 1024 | 3116 -> 3139 -> 3094 ( +23, -22) GFLOPS
problemSize = 1002 | A B^T | 1024 -> 1024 -> 1024 | 3137 -> 3148 -> 3101 ( +11, -36) GFLOPS
problemSize = 1003 | A B^T | 1024 -> 1024 -> 1024 | 3136 -> 3156 -> 3108 ( +20, -28) GFLOPS
problemSize = 1004 | A B^T | 1024 -> 1024 -> 1024 | 3167 -> 3172 -> 3135 ( +5, -32) GFLOPS
problemSize = 1005 | A B^T | 1024 -> 1024 -> 1024 | 3152 -> 3175 -> 3128 ( +23, -24) GFLOPS
problemSize = 1006 | A B^T | 1024 -> 1024 -> 1024 | 3174 -> 3184 -> 3139 ( +10, -35) GFLOPS
problemSize = 1007 | A B^T | 1024 -> 1024 -> 1024 | 3169 -> 3192 -> 3145 ( +23, -24) GFLOPS
problemSize = 1008 | A B^T | 1024 -> 1024 -> 1024 | 3223 -> 3221 -> 3184 ( -2, -39) GFLOPS
problemSize = 1009 | A B^T | 1024 -> 1024 -> 1024 | 3162 -> 3189 -> 3142 ( +27, -20) GFLOPS
problemSize = 1010 | A B^T | 1024 -> 1024 -> 1024 | 3187 -> 3200 -> 3152 ( +13, -35) GFLOPS
problemSize = 1011 | A B^T | 1024 -> 1024 -> 1024 | 3184 -> 3206 -> 3160 ( +22, -24) GFLOPS
problemSize = 1012 | A B^T | 1024 -> 1024 -> 1024 | 3213 -> 3222 -> 3173 ( +9, -40) GFLOPS
problemSize = 1013 | A B^T | 1024 -> 1024 -> 1024 | 3185 -> 3228 -> 3178 ( +43, -7) GFLOPS
problemSize = 1014 | A B^T | 1024 -> 1024 -> 1024 | 3213 -> 3236 -> 3190 ( +23, -23) GFLOPS
problemSize = 1015 | A B^T | 1024 -> 1024 -> 1024 | 3189 -> 3238 -> 3193 ( +49, +4) GFLOPS
problemSize = 1016 | A B^T | 1024 -> 1024 -> 1024 | 3261 -> 3260 -> 3229 ( -1, -32) GFLOPS
problemSize = 1017 | A B^T | 1024 -> 1024 -> 1024 | 3181 -> 3231 -> 3186 ( +50, +5) GFLOPS
problemSize = 1018 | A B^T | 1024 -> 1024 -> 1024 | 3223 -> 3252 -> 3201 ( +29, -22) GFLOPS
problemSize = 1019 | A B^T | 1024 -> 1024 -> 1024 | 3208 -> 3253 -> 3202 ( +45, -6) GFLOPS
problemSize = 1020 | A B^T | 1024 -> 1024 -> 1024 | 3263 -> 3269 -> 3222 ( +6, -41) GFLOPS
problemSize = 1021 | A B^T | 1024 -> 1024 -> 1024 | 3232 -> 3271 -> 3221 ( +39, -11) GFLOPS
problemSize = 1022 | A B^T | 1024 -> 1024 -> 1024 | 3263 -> 3281 -> 3232 ( +18, -31) GFLOPS
problemSize = 1023 | A B^T | 1024 -> 1024 -> 1024 | 3245 -> 3287 -> 3237 ( +42, -8) GFLOPS
problemSize = 1024 | A B^T | 1024 -> 1024 -> 1024 | 3316 -> 3312 -> 3318 ( -4, +2) GFLOPS
problemSize = 976 | A^T B | 1024 -> 1024 -> 1024 | 3212 -> 3208 -> 3178 ( -4, -34) GFLOPS
problemSize = 977 | A^T B | 1024 -> 1024 -> 1024 | 3184 -> 3187 -> 3157 ( +3, -27) GFLOPS
problemSize = 978 | A^T B | 1024 -> 1024 -> 1024 | 3194 -> 3195 -> 3169 ( +1, -25) GFLOPS
problemSize = 979 | A^T B | 1024 -> 1024 -> 1024 | 3202 -> 3206 -> 3179 ( +4, -23) GFLOPS
problemSize = 980 | A^T B | 1024 -> 1024 -> 1024 | 3212 -> 3209 -> 3190 ( -3, -22) GFLOPS
problemSize = 981 | A^T B | 1024 -> 1024 -> 1024 | 3223 -> 3227 -> 3191 ( +4, -32) GFLOPS
problemSize = 982 | A^T B | 1024 -> 1024 -> 1024 | 3232 -> 3238 -> 3208 ( +6, -24) GFLOPS
problemSize = 983 | A^T B | 1024 -> 1024 -> 1024 | 3242 -> 3249 -> 3214 ( +7, -28) GFLOPS
problemSize = 984 | A^T B | 1024 -> 1024 -> 1024 | 3261 -> 3258 -> 3230 ( -3, -31) GFLOPS
problemSize = 985 | A^T B | 1024 -> 1024 -> 1024 | 3235 -> 3243 -> 3210 ( +8, -25) GFLOPS
problemSize = 986 | A^T B | 1024 -> 1024 -> 1024 | 3247 -> 3255 -> 3219 ( +8, -28) GFLOPS
problemSize = 987 | A^T B | 1024 -> 1024 -> 1024 | 3260 -> 3260 -> 3225 ( 0, -35) GFLOPS
problemSize = 988 | A^T B | 1024 -> 1024 -> 1024 | 3263 -> 3267 -> 3244 ( +4, -19) GFLOPS
problemSize = 989 | A^T B | 1024 -> 1024 -> 1024 | 3278 -> 3281 -> 3241 ( +3, -37) GFLOPS
problemSize = 990 | A^T B | 1024 -> 1024 -> 1024 | 3287 -> 3286 -> 3253 ( -1, -34) GFLOPS
problemSize = 991 | A^T B | 1024 -> 1024 -> 1024 | 3297 -> 3292 -> 3262 ( -5, -35) GFLOPS
problemSize = 992 | A^T B | 1024 -> 1024 -> 1024 | 3301 -> 3302 -> 3311 ( +1, +10) GFLOPS
problemSize = 993 | A^T B | 1024 -> 1024 -> 1024 | 3104 -> 3107 -> 3084 ( +3, -20) GFLOPS
problemSize = 994 | A^T B | 1024 -> 1024 -> 1024 | 3116 -> 3116 -> 3094 ( 0, -22) GFLOPS
problemSize = 995 | A^T B | 1024 -> 1024 -> 1024 | 3126 -> 3129 -> 3100 ( +3, -26) GFLOPS
problemSize = 996 | A^T B | 1024 -> 1024 -> 1024 | 3131 -> 3131 -> 3110 ( 0, -21) GFLOPS
problemSize = 997 | A^T B | 1024 -> 1024 -> 1024 | 3140 -> 3144 -> 3121 ( +4, -19) GFLOPS
problemSize = 998 | A^T B | 1024 -> 1024 -> 1024 | 3149 -> 3155 -> 3131 ( +6, -18) GFLOPS
problemSize = 999 | A^T B | 1024 -> 1024 -> 1024 | 3164 -> 3161 -> 3141 ( -3, -23) GFLOPS
problemSize = 1000 | A^T B | 1024 -> 1024 -> 1024 | 3165 -> 3166 -> 3143 ( +1, -22) GFLOPS
problemSize = 1001 | A^T B | 1024 -> 1024 -> 1024 | 3154 -> 3159 -> 3140 ( +5, -14) GFLOPS
problemSize = 1002 | A^T B | 1024 -> 1024 -> 1024 | 3169 -> 3163 -> 3147 ( -6, -22) GFLOPS
problemSize = 1003 | A^T B | 1024 -> 1024 -> 1024 | 3179 -> 3177 -> 3157 ( -2, -22) GFLOPS
problemSize = 1004 | A^T B | 1024 -> 1024 -> 1024 | 3184 -> 3186 -> 3159 ( +2, -25) GFLOPS
problemSize = 1005 | A^T B | 1024 -> 1024 -> 1024 | 3194 -> 3198 -> 3175 ( +4, -19) GFLOPS
problemSize = 1006 | A^T B | 1024 -> 1024 -> 1024 | 3203 -> 3206 -> 3188 ( +3, -15) GFLOPS
problemSize = 1007 | A^T B | 1024 -> 1024 -> 1024 | 3214 -> 3216 -> 3196 ( +2, -18) GFLOPS
problemSize = 1008 | A^T B | 1024 -> 1024 -> 1024 | 3218 -> 3218 -> 3197 ( 0, -21) GFLOPS
problemSize = 1009 | A^T B | 1024 -> 1024 -> 1024 | 3214 -> 3213 -> 3187 ( -1, -27) GFLOPS
problemSize = 1010 | A^T B | 1024 -> 1024 -> 1024 | 3220 -> 3220 -> 3189 ( 0, -31) GFLOPS
problemSize = 1011 | A^T B | 1024 -> 1024 -> 1024 | 3228 -> 3226 -> 3203 ( -2, -25) GFLOPS
problemSize = 1012 | A^T B | 1024 -> 1024 -> 1024 | 3239 -> 3238 -> 3213 ( -1, -26) GFLOPS
problemSize = 1013 | A^T B | 1024 -> 1024 -> 1024 | 3251 -> 3249 -> 3218 ( -2, -33) GFLOPS
problemSize = 1014 | A^T B | 1024 -> 1024 -> 1024 | 3257 -> 3257 -> 3229 ( 0, -28) GFLOPS
problemSize = 1015 | A^T B | 1024 -> 1024 -> 1024 | 3269 -> 3269 -> 3240 ( 0, -29) GFLOPS
problemSize = 1016 | A^T B | 1024 -> 1024 -> 1024 | 3270 -> 3268 -> 3246 ( -2, -24) GFLOPS
problemSize = 1017 | A^T B | 1024 -> 1024 -> 1024 | 3263 -> 3266 -> 3237 ( +3, -26) GFLOPS
problemSize = 1018 | A^T B | 1024 -> 1024 -> 1024 | 3274 -> 3271 -> 3243 ( -3, -31) GFLOPS
problemSize = 1019 | A^T B | 1024 -> 1024 -> 1024 | 3286 -> 3286 -> 3250 ( 0, -36) GFLOPS
problemSize = 1020 | A^T B | 1024 -> 1024 -> 1024 | 3290 -> 3294 -> 3262 ( +4, -28) GFLOPS
problemSize = 1021 | A^T B | 1024 -> 1024 -> 1024 | 3304 -> 3307 -> 3274 ( +3, -30) GFLOPS
problemSize = 1022 | A^T B | 1024 -> 1024 -> 1024 | 3311 -> 3310 -> 3281 ( -1, -30) GFLOPS
problemSize = 1023 | A^T B | 1024 -> 1024 -> 1024 | 3322 -> 3322 -> 3293 ( 0, -29) GFLOPS
problemSize = 1024 | A^T B | 1024 -> 1024 -> 1024 | 3318 -> 3317 -> 3299 ( -1, -19) GFLOPS
problemSize = 976 | A^T B^T | 1024 -> 1024 -> 1024 | 3169 -> 3169 -> 3160 ( 0, -9) GFLOPS
problemSize = 977 | A^T B^T | 1024 -> 1024 -> 1024 | 3144 -> 3151 -> 3128 ( +7, -16) GFLOPS
problemSize = 978 | A^T B^T | 1024 -> 1024 -> 1024 | 3154 -> 3155 -> 3134 ( +1, -20) GFLOPS
problemSize = 979 | A^T B^T | 1024 -> 1024 -> 1024 | 3164 -> 3169 -> 3146 ( +5, -18) GFLOPS
problemSize = 980 | A^T B^T | 1024 -> 1024 -> 1024 | 3173 -> 3181 -> 3158 ( +8, -15) GFLOPS
problemSize = 981 | A^T B^T | 1024 -> 1024 -> 1024 | 3180 -> 3184 -> 3161 ( +4, -19) GFLOPS
problemSize = 982 | A^T B^T | 1024 -> 1024 -> 1024 | 3191 -> 3195 -> 3167 ( +4, -24) GFLOPS
problemSize = 983 | A^T B^T | 1024 -> 1024 -> 1024 | 3202 -> 3200 -> 3181 ( -2, -21) GFLOPS
problemSize = 984 | A^T B^T | 1024 -> 1024 -> 1024 | 3219 -> 3219 -> 3207 ( 0, -12) GFLOPS
problemSize = 985 | A^T B^T | 1024 -> 1024 -> 1024 | 3194 -> 3196 -> 3168 ( +2, -26) GFLOPS
problemSize = 986 | A^T B^T | 1024 -> 1024 -> 1024 | 3200 -> 3215 -> 3179 ( +15, -21) GFLOPS
problemSize = 987 | A^T B^T | 1024 -> 1024 -> 1024 | 3216 -> 3219 -> 3191 ( +3, -25) GFLOPS
problemSize = 988 | A^T B^T | 1024 -> 1024 -> 1024 | 3225 -> 3232 -> 3204 ( +7, -21) GFLOPS
problemSize = 989 | A^T B^T | 1024 -> 1024 -> 1024 | 3234 -> 3232 -> 3209 ( -2, -25) GFLOPS
problemSize = 990 | A^T B^T | 1024 -> 1024 -> 1024 | 3238 -> 3244 -> 3219 ( +6, -19) GFLOPS
problemSize = 991 | A^T B^T | 1024 -> 1024 -> 1024 | 3249 -> 3254 -> 3231 ( +5, -18) GFLOPS
problemSize = 992 | A^T B^T | 1024 -> 1024 -> 1024 | 3240 -> 3241 -> 3227 ( +1, -13) GFLOPS
problemSize = 993 | A^T B^T | 1024 -> 1024 -> 1024 | 3059 -> 3057 -> 3049 ( -2, -10) GFLOPS
problemSize = 994 | A^T B^T | 1024 -> 1024 -> 1024 | 3068 -> 3067 -> 3055 ( -1, -13) GFLOPS
problemSize = 995 | A^T B^T | 1024 -> 1024 -> 1024 | 3076 -> 3079 -> 3064 ( +3, -12) GFLOPS
problemSize = 996 | A^T B^T | 1024 -> 1024 -> 1024 | 3092 -> 3087 -> 3070 ( -5, -22) GFLOPS
problemSize = 997 | A^T B^T | 1024 -> 1024 -> 1024 | 3097 -> 3100 -> 3083 ( +3, -14) GFLOPS
problemSize = 998 | A^T B^T | 1024 -> 1024 -> 1024 | 3108 -> 3106 -> 3092 ( -2, -16) GFLOPS
problemSize = 999 | A^T B^T | 1024 -> 1024 -> 1024 | 3115 -> 3115 -> 3100 ( 0, -15) GFLOPS
problemSize = 1000 | A^T B^T | 1024 -> 1024 -> 1024 | 3124 -> 3122 -> 3115 ( -2, -9) GFLOPS
problemSize = 1001 | A^T B^T | 1024 -> 1024 -> 1024 | 3110 -> 3111 -> 3090 ( +1, -20) GFLOPS
problemSize = 1002 | A^T B^T | 1024 -> 1024 -> 1024 | 3123 -> 3120 -> 3100 ( -3, -23) GFLOPS
problemSize = 1003 | A^T B^T | 1024 -> 1024 -> 1024 | 3131 -> 3129 -> 3107 ( -2, -24) GFLOPS
problemSize = 1004 | A^T B^T | 1024 -> 1024 -> 1024 | 3141 -> 3139 -> 3126 ( -2, -15) GFLOPS
problemSize = 1005 | A^T B^T | 1024 -> 1024 -> 1024 | 3146 -> 3148 -> 3126 ( +2, -20) GFLOPS
problemSize = 1006 | A^T B^T | 1024 -> 1024 -> 1024 | 3159 -> 3157 -> 3136 ( -2, -23) GFLOPS
problemSize = 1007 | A^T B^T | 1024 -> 1024 -> 1024 | 3166 -> 3168 -> 3142 ( +2, -24) GFLOPS
problemSize = 1008 | A^T B^T | 1024 -> 1024 -> 1024 | 3179 -> 3180 -> 3168 ( +1, -11) GFLOPS
problemSize = 1009 | A^T B^T | 1024 -> 1024 -> 1024 | 3162 -> 3163 -> 3145 ( +1, -17) GFLOPS
problemSize = 1010 | A^T B^T | 1024 -> 1024 -> 1024 | 3169 -> 3172 -> 3151 ( +3, -18) GFLOPS
problemSize = 1011 | A^T B^T | 1024 -> 1024 -> 1024 | 3179 -> 3177 -> 3161 ( -2, -18) GFLOPS
problemSize = 1012 | A^T B^T | 1024 -> 1024 -> 1024 | 3191 -> 3191 -> 3168 ( 0, -23) GFLOPS
problemSize = 1013 | A^T B^T | 1024 -> 1024 -> 1024 | 3203 -> 3199 -> 3181 ( -4, -22) GFLOPS
problemSize = 1014 | A^T B^T | 1024 -> 1024 -> 1024 | 3211 -> 3210 -> 3190 ( -1, -21) GFLOPS
problemSize = 1015 | A^T B^T | 1024 -> 1024 -> 1024 | 3217 -> 3219 -> 3199 ( +2, -18) GFLOPS
problemSize = 1016 | A^T B^T | 1024 -> 1024 -> 1024 | 3223 -> 3227 -> 3216 ( +4, -7) GFLOPS
problemSize = 1017 | A^T B^T | 1024 -> 1024 -> 1024 | 3213 -> 3212 -> 3193 ( -1, -20) GFLOPS
problemSize = 1018 | A^T B^T | 1024 -> 1024 -> 1024 | 3226 -> 3223 -> 3197 ( -3, -29) GFLOPS
problemSize = 1019 | A^T B^T | 1024 -> 1024 -> 1024 | 3229 -> 3229 -> 3209 ( 0, -20) GFLOPS
problemSize = 1020 | A^T B^T | 1024 -> 1024 -> 1024 | 3241 -> 3242 -> 3217 ( +1, -24) GFLOPS
problemSize = 1021 | A^T B^T | 1024 -> 1024 -> 1024 | 3253 -> 3248 -> 3229 ( -5, -24) GFLOPS
problemSize = 1022 | A^T B^T | 1024 -> 1024 -> 1024 | 3258 -> 3261 -> 3244 ( +3, -14) GFLOPS
problemSize = 1023 | A^T B^T | 1024 -> 1024 -> 1024 | 3265 -> 3268 -> 3246 ( +3, -19) GFLOPS
problemSize = 1024 | A^T B^T | 1024 -> 1024 -> 1024 | 3251 -> 3256 -> 3245 ( +5, -6) GFLOPS
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment