# Source: GitHub gist cgmb/5bc00ad3f04afbdd04e2ef12d4aabe2d
# (The original "Save ... to your computer and use it in GitHub Desktop." banner was page residue, not part of the command list.)
# Replay of a logged sequence of rocBLAS `gemm_batched_ex` benchmark commands.
#
# The log alternates between two problem configurations, both with f16_r A/B
# inputs, transposeA=T, transposeB=N, alpha=1, beta=0 and batch_count=32:
#   * f32 variant: m=32,  k=128, lda=1024, ldb=4096, ldd=32,  C/D/compute f32_r
#   * f16 variant: m=128, k=32,  lda=2048, ldb=32,   ldd=128, C/D/compute f16_r
# Only `n` changes over the course of the log (2, then 5).
#
# Environment:
#   ROCBLAS_BENCH  benchmark executable to invoke (default: ./rocblas-bench)
#
# The trailing "| |" that followed every logged command was table residue and
# a shell syntax error; it has been dropped.
#
# NOTE(review): every logged command passes `--ldc` immediately followed by
# `--d_type`, i.e. without a value. That is reproduced verbatim below because
# the intended leading dimension cannot be recovered from the log -- confirm
# the value (presumably it should mirror --ldd) before relying on results.

# Runs the f32-output configuration once.
# Arguments: $1 - value for -n
# Returns:   exit status of the benchmark executable
bench_gemm_batched_ex_f32() {
  local n=$1
  "${ROCBLAS_BENCH:-./rocblas-bench}" -f gemm_batched_ex \
    --transposeA T --transposeB N -m 32 -n "$n" -k 128 --alpha 1 \
    --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 \
    --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 \
    --compute_type f32_r --algo 0 --solution_index 0 --flags 1
}

# Runs the f16-output configuration once.
# Arguments: $1 - value for -n
# Returns:   exit status of the benchmark executable
bench_gemm_batched_ex_f16() {
  local n=$1
  "${ROCBLAS_BENCH:-./rocblas-bench}" -f gemm_batched_ex \
    --transposeA T --transposeB N -m 128 -n "$n" -k 32 --alpha 1 \
    --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 \
    --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 \
    --compute_type f16_r --algo 0 --solution_index 0 --flags 1
}

# Issues the benchmark commands in exactly the logged order:
#   32 x (f32, f16) with n=2, then 29 x (f32, f16) with n=5, then one f32 n=5.
# A failing invocation does not stop the replay (the original list had no
# error handling either).
replay_gemm_batched_ex_log() {
  local i

  i=0
  while [ "$i" -lt 32 ]; do
    bench_gemm_batched_ex_f32 2
    bench_gemm_batched_ex_f16 2
    i=$((i + 1))
  done

  i=0
  while [ "$i" -lt 29 ]; do
    bench_gemm_batched_ex_f32 5
    bench_gemm_batched_ex_f16 5
    i=$((i + 1))
  done

  # The visible log ends on an f32 run without its f16 partner.
  bench_gemm_batched_ex_f32 5
}

replay_gemm_batched_ex_log
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 |
- { rocblas_function: "rocblas_gemm_batched_ex", atomics_mode: atomics_allowed, a_type: "f16_r", b_type: "f16_r", c_type: "f16_r", d_type: "f16_r", compute_type: "f16_r", transA: 'T', transB: 'N', M: 128, N: 5, K: 32, alpha: 1.0, lda: 2048, ldb: 32, beta: 0.0, ldc: 128, ldd: 128, batch_count: 32, algo: 0, solution_index: 0, flags: pack_int, call_count: 32 } | |
- { rocblas_function: "rocblas_gemm_batched_ex", atomics_mode: atomics_allowed, a_type: "f16_r", b_type: "f16_r", c_type: "f32_r", d_type: "f32_r", compute_type: "f32_r", transA: 'T', transB: 'N', M: 32, N: 5, K: 128, alpha: 1.0, lda: 1024, ldb: 4096, beta: 0.0, ldc: 32, ldd: 32, batch_count: 32, algo: 0, solution_index: 0, flags: pack_int, call_count: 32 } | |
- { rocblas_function: "rocblas_gemm_batched_ex", atomics_mode: atomics_allowed, a_type: "f16_r", b_type: "f16_r", c_type: "f16_r", d_type: "f16_r", compute_type: "f16_r", transA: 'T', transB: 'N', M: 128, N: 2, K: 32, alpha: 1.0, lda: 2048, ldb: 32, beta: 0.0, ldc: 128, ldd: 128, batch_count: 32, algo: 0, solution_index: 0, flags: pack_int, call_count: 32 } | |
- { rocblas_function: "rocblas_gemm_batched_ex", atomics_mode: atomics_allowed, a_type: "f16_r", b_type: "f16_r", c_type: "f32_r", d_type: "f32_r", compute_type: "f32_r", transA: 'T', transB: 'N', M: 32, N: 2, K: 128, alpha: 1.0, lda: 1024, ldb: 4096, beta: 0.0, ldc: 32, ldd: 32, batch_count: 32, algo: 0, solution_index: 0, flags: pack_int, call_count: 32 } |
ROCm calling rocblas_initialize as a workaround for a rocBLAS bug | |
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no | |
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no | |
ggml_cuda_init: found 1 ROCm devices: | |
Device 0: AMD Radeon RX 6800 XT, gfx1030 (0x1030), VMM: no, Wave Size: 32 | |
build: 4621 (6eecde3c) with cc (Debian 14.2.0-12) 14.2.0 for x86_64-linux-gnu | |
main: llama backend init | |
main: load the model and apply lora adapter, if any | |
llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon RX 6800 XT) - 16046 MiB free | |
llama_model_loader: loaded meta data with 33 key-value pairs and 292 tensors from /home/cgmb/ws/Meta-Llama-3.1-8B-Instruct-Q6_K.gguf (version GGUF V3 (latest)) | |
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. | |
llama_model_loader: - kv 0: general.architecture str = llama | |
llama_model_loader: - kv 1: general.type str = model | |
llama_model_loader: - kv 2: general.name str = Meta Llama 3.1 8B Instruct | |
llama_model_loader: - kv 3: general.finetune str = Instruct | |
llama_model_loader: - kv 4: general.basename str = Meta-Llama-3.1 | |
llama_model_loader: - kv 5: general.size_label str = 8B | |
llama_model_loader: - kv 6: general.license str = llama3.1 | |
llama_model_loader: - kv 7: general.tags arr[str,6] = ["facebook", "meta", "pytorch", "llam... | |
llama_model_loader: - kv 8: general.languages arr[str,8] = ["en", "de", "fr", "it", "pt", "hi", ... | |
llama_model_loader: - kv 9: llama.block_count u32 = 32 | |
llama_model_loader: - kv 10: llama.context_length u32 = 131072 | |
llama_model_loader: - kv 11: llama.embedding_length u32 = 4096 | |
llama_model_loader: - kv 12: llama.feed_forward_length u32 = 14336 | |
llama_model_loader: - kv 13: llama.attention.head_count u32 = 32 | |
llama_model_loader: - kv 14: llama.attention.head_count_kv u32 = 8 | |
llama_model_loader: - kv 15: llama.rope.freq_base f32 = 500000.000000 | |
llama_model_loader: - kv 16: llama.attention.layer_norm_rms_epsilon f32 = 0.000010 | |
llama_model_loader: - kv 17: general.file_type u32 = 18 | |
llama_model_loader: - kv 18: llama.vocab_size u32 = 128256 | |
llama_model_loader: - kv 19: llama.rope.dimension_count u32 = 128 | |
llama_model_loader: - kv 20: tokenizer.ggml.model str = gpt2 | |
llama_model_loader: - kv 21: tokenizer.ggml.pre str = llama-bpe | |
llama_model_loader: - kv 22: tokenizer.ggml.tokens arr[str,128256] = ["!", "\"", "#", "$", "%", "&", "'", ... | |
llama_model_loader: - kv 23: tokenizer.ggml.token_type arr[i32,128256] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | |
llama_model_loader: - kv 24: tokenizer.ggml.merges arr[str,280147] = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "... | |
llama_model_loader: - kv 25: tokenizer.ggml.bos_token_id u32 = 128000 | |
llama_model_loader: - kv 26: tokenizer.ggml.eos_token_id u32 = 128009 | |
llama_model_loader: - kv 27: tokenizer.chat_template str = {{- bos_token }}\n{%- if custom_tools ... | |
llama_model_loader: - kv 28: general.quantization_version u32 = 2 | |
llama_model_loader: - kv 29: quantize.imatrix.file str = /models_out/Meta-Llama-3.1-8B-Instruc... | |
llama_model_loader: - kv 30: quantize.imatrix.dataset str = /training_dir/calibration_datav3.txt | |
llama_model_loader: - kv 31: quantize.imatrix.entries_count i32 = 224 | |
llama_model_loader: - kv 32: quantize.imatrix.chunks_count i32 = 125 | |
llama_model_loader: - type f32: 66 tensors | |
llama_model_loader: - type q6_K: 226 tensors | |
print_info: file format = GGUF V3 (latest) | |
print_info: file type = Q6_K | |
print_info: file size = 6.14 GiB (6.56 BPW) | |
load: special tokens cache size = 256 | |
load: token to piece cache size = 0.7999 MB | |
print_info: arch = llama | |
print_info: vocab_only = 0 | |
print_info: n_ctx_train = 131072 | |
print_info: n_embd = 4096 | |
print_info: n_layer = 32 | |
print_info: n_head = 32 | |
print_info: n_head_kv = 8 | |
print_info: n_rot = 128 | |
print_info: n_swa = 0 | |
print_info: n_embd_head_k = 128 | |
print_info: n_embd_head_v = 128 | |
print_info: n_gqa = 4 | |
print_info: n_embd_k_gqa = 1024 | |
print_info: n_embd_v_gqa = 1024 | |
print_info: f_norm_eps = 0.0e+00 | |
print_info: f_norm_rms_eps = 1.0e-05 | |
print_info: f_clamp_kqv = 0.0e+00 | |
print_info: f_max_alibi_bias = 0.0e+00 | |
print_info: f_logit_scale = 0.0e+00 | |
print_info: n_ff = 14336 | |
print_info: n_expert = 0 | |
print_info: n_expert_used = 0 | |
print_info: causal attn = 1 | |
print_info: pooling type = 0 | |
print_info: rope type = 0 | |
print_info: rope scaling = linear | |
print_info: freq_base_train = 500000.0 | |
print_info: freq_scale_train = 1 | |
print_info: n_ctx_orig_yarn = 131072 | |
print_info: rope_finetuned = unknown | |
print_info: ssm_d_conv = 0 | |
print_info: ssm_d_inner = 0 | |
print_info: ssm_d_state = 0 | |
print_info: ssm_dt_rank = 0 | |
print_info: ssm_dt_b_c_rms = 0 | |
print_info: model type = 8B | |
print_info: model params = 8.03 B | |
print_info: general.name = Meta Llama 3.1 8B Instruct | |
print_info: vocab type = BPE | |
print_info: n_vocab = 128256 | |
print_info: n_merges = 280147 | |
print_info: BOS token = 128000 '<|begin_of_text|>' | |
print_info: EOS token = 128009 '<|eot_id|>' | |
print_info: EOT token = 128009 '<|eot_id|>' | |
print_info: EOM token = 128008 '<|eom_id|>' | |
print_info: LF token = 198 'Ċ' | |
print_info: EOG token = 128008 '<|eom_id|>' | |
print_info: EOG token = 128009 '<|eot_id|>' | |
print_info: max token length = 256 | |
load_tensors: offloading 32 repeating layers to GPU | |
load_tensors: offloading output layer to GPU | |
load_tensors: offloaded 33/33 layers to GPU | |
load_tensors: ROCm0 model buffer size = 5871.99 MiB | |
load_tensors: CPU_Mapped model buffer size = 410.98 MiB | |
llama_init_from_model: n_seq_max = 1 | |
llama_init_from_model: n_ctx = 2048 | |
llama_init_from_model: n_ctx_per_seq = 2048 | |
llama_init_from_model: n_batch = 2048 | |
llama_init_from_model: n_ubatch = 512 | |
llama_init_from_model: flash_attn = 0 | |
llama_init_from_model: freq_base = 500000.0 | |
llama_init_from_model: freq_scale = 1 | |
llama_init_from_model: n_ctx_per_seq (2048) < n_ctx_train (131072) -- the full capacity of the model will not be utilized | |
llama_kv_cache_init: kv_size = 2048, offload = 1, type_k = 'f16', type_v = 'f16', n_layer = 32, can_shift = 1 | |
llama_kv_cache_init: ROCm0 KV buffer size = 256.00 MiB | |
llama_init_from_model: KV self size = 256.00 MiB, K (f16): 128.00 MiB, V (f16): 128.00 MiB | |
llama_init_from_model: ROCm_Host output buffer size = 0.49 MiB | |
llama_init_from_model: ROCm0 compute buffer size = 258.50 MiB | |
llama_init_from_model: ROCm_Host compute buffer size = 12.01 MiB | |
llama_init_from_model: graph nodes = 1030 | |
llama_init_from_model: graph splits = 2 | |
common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048 | |
common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable) | |
main: llama threadpool init, n_threads = 16 | |
system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | LLAMAFILE = 1 | OPENMP = 1 | AARCH64_REPACK = 1 | | |
sampler seed: 2360744227 | |
sampler params: | |
repeat_last_n = 64, repeat_penalty = 1.100, frequency_penalty = 0.000, presence_penalty = 0.000 | |
dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 2048 | |
top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, temp = 0.700 | |
mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 | |
sampler chain: logits -> logit-bias -> penalties -> dry -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist | |
generate: n_ctx = 2048, n_batch = 2048, n_predict = -1, n_keep = 1 | |
llama_perf_sampler_print: sampling time = 328.84 ms / 693 runs ( 0.47 ms per token, 2107.40 tokens per second) | |
llama_perf_context_print: load time = 1635.05 ms | |
llama_perf_context_print: prompt eval time = 33.80 ms / 5 tokens ( 6.76 ms per token, 147.93 tokens per second) | |
llama_perf_context_print: eval time = 12717.05 ms / 687 runs ( 18.51 ms per token, 54.02 tokens per second) | |
llama_perf_context_print: total time = 13167.60 ms / 692 tokens |
loaded code object /lib/x86_64-linux-gnu/rocblas/2.47.0/library/Kernels.so-000-gfx1030.hsaco | |
loaded code object /lib/x86_64-linux-gnu/rocblas/2.47.0/library/TensileLibrary_gfx1030.co | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
AMDGPU(matches: And(Processor(gfx90a): 0, CUCount(104): 0): 0 | |
): 0 | |
AMDGPU(matches: Processor(gfx1030): 1): 1 | |
ProblemMap Searching for Contraction_l_Alik_Bljk_Cijk_Dijk found Problem library (1 rows) | |
TruePred: 1 | |
TruePred: 1 | |
And(TypesEqual(a:Half == Float&& b:Half == Float&& c:Float == Float&& d:Float == Float): 0, HighPrecisionAccumulate(0): 0): 0 | |
And(TypesEqual(a:Half == Half&& b:Half == Half&& c:Float == Half&& d:Float == Half): 0, HighPrecisionAccumulate(0): 0): 0 | |
And(TypesEqual(a:Half == Half&& b:Half == Half&& c:Float == Half&& d:Float == Half): 0): 0 | |
And(TypesEqual(a:Half == Int8&& b:Half == Int8&& c:Float == Int32&& d:Float == Int32): 0): 0 | |
AMDGPU(matches: Processor(gfx1100): 0): 0 | |
AMDGPU(matches: Processor(gfx1101): 0): 0 | |
AMDGPU(matches: Processor(gfx1102): 0): 0 | |
AMDGPU(matches: Processor(gfx803): 0): 0 | |
AMDGPU(matches: Processor(gfx900): 0): 0 | |
AMDGPU(matches: Processor(gfx906): 0): 0 | |
AMDGPU(matches: Processor(gfx908): 0): 0 | |
AMDGPU(matches: Processor(gfx90a): 0): 0 | |
TruePred: 1 | |
ProblemMap Searching for Contraction_l_Alik_Bljk_Cijk_Dijk found Problem library (1 rows) | |
TruePred: 1 | |
TruePred: 1 | |
And(TypesEqual(a:Half == Float&& b:Half == Float&& c:Float == Float&& d:Float == Float): 0, HighPrecisionAccumulate(0): 0): 0 | |
And(TypesEqual(a:Half == Double&& b:Half == Double&& c:Float == Double&& d:Float == Double): 0, HighPrecisionAccumulate(0): 0): 0 | |
And(TypesEqual(a:Half == ComplexFloat&& b:Half == ComplexFloat&& c:Float == ComplexFloat&& d:Float == ComplexFloat): 0, HighPrecisionAccumulate(0): 0): 0 | |
And(TypesEqual(a:Half == ComplexDouble&& b:Half == ComplexDouble&& c:Float == ComplexDouble&& d:Float == ComplexDouble): 0, HighPrecisionAccumulate(0): 0): 0 | |
And(): 1 | |
TruePred: 1 | |
Object key: 32, 2, 128 | |
Key: 32, 2, 128 | |
Starting point: 127, 1, 63 | |
Rightward search... | |
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
speed: 3 | 127, 1, 63: 13251 < 1.79769e+308 <-- Best distance, but no matching solution | |
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
speed: 3 | 127, 1, 63: 13251 < 1.79769e+308 <-- Best distance, but no matching solution | |
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
speed: 3 | 127, 1, 64: 13122 < 1.79769e+308 <-- Best distance, but no matching solution | |
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
speed: 3 | 127, 1, 64: 13122 < 1.79769e+308 <-- Best distance, but no matching solution | |
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
speed: 3 | 127, 1, 65: 12995 < 1.79769e+308 <-- Best distance, but no matching solution | |
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
speed: 3 | 127, 1, 65: 12995 < 1.79769e+308 <-- Best distance, but no matching solution | |
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
speed: 3 | 127, 127, 1: 40779 < 1.79769e+308 <-- Best distance, but no matching solution | |
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
speed: 3 | 127, 127, 1: 40779 < 1.79769e+308 <-- Best distance, but no matching solution | |
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
speed: 208 | 127, 127, 63: 28875 < 1.79769e+308 <-- Best distance, but no matching solution | |
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
speed: 208 | 127, 127, 63: 28875 < 1.79769e+308 <-- Best distance, but no matching solution | |
TruePred: 1And(): 1 | |
speed: 179 | 127, 127, 64: 28746 == 28746 | |
speed: 179 | 127, 127, 64: 28746 == 28746 | |
TruePred: 1And(): 1 | |
speed: 213 | 127, 127, 65: 28619 == 28619 | |
speed: 213 | 127, 127, 65: 28619 == 28619 | |
speed: 3 | 127, 128, 1: 41030 > 28619 | |
speed: 3 | 127, 128, 1: 41030 > 28619 | |
speed: 3 | 127, 129, 1: 41283 > 28619 | |
speed: 3 | 127, 129, 1: 41283 > 28619 | |
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
speed: 3 | 128, 1, 63: 13442 < 28619 <-- Best distance, but no matching solution | |
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
speed: 3 | 128, 1, 63: 13442 < 28619 <-- Best distance, but no matching solution | |
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
speed: 3 | 128, 1, 64: 13313 < 28619 <-- Best distance, but no matching solution | |
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
speed: 3 | 128, 1, 64: 13313 < 28619 <-- Best distance, but no matching solution | |
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
speed: 3 | 128, 1, 65: 13186 < 28619 <-- Best distance, but no matching solution | |
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
speed: 3 | 128, 1, 65: 13186 < 28619 <-- Best distance, but no matching solution | |
speed: 3 | 128, 127, 1: 40970 > 28619 | |
speed: 3 | 128, 127, 1: 40970 > 28619 | |
speed: 3 | 128, 128, 1: 41221 > 28619 | |
speed: 3 | 128, 128, 1: 41221 > 28619 | |
speed: 200 | 128, 128, 63: 29317 > 28619 | |
speed: 200 | 128, 128, 63: 29317 > 28619 | |
speed: 176 | 128, 128, 64: 29188 > 28619 | |
speed: 176 | 128, 128, 64: 29188 > 28619 | |
speed: 220 | 128, 128, 65: 29061 > 28619 | |
speed: 220 | 128, 128, 65: 29061 > 28619 | |
speed: 3 | 128, 129, 1: 41474 > 28619 | |
speed: 3 | 128, 129, 1: 41474 > 28619 | |
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
speed: 3 | 129, 1, 63: 13635 < 28619 <-- Best distance, but no matching solution | |
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
speed: 3 | 129, 1, 63: 13635 < 28619 <-- Best distance, but no matching solution | |
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
speed: 3 | 129, 1, 64: 13506 < 28619 <-- Best distance, but no matching solution | |
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
speed: 3 | 129, 1, 64: 13506 < 28619 <-- Best distance, but no matching solution | |
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
speed: 3 | 129, 1, 65: 13379 < 28619 <-- Best distance, but no matching solution | |
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
speed: 3 | 129, 1, 65: 13379 < 28619 <-- Best distance, but no matching solution | |
speed: 3 | 129, 127, 1: 41163 > 28619 | |
speed: 3 | 129, 127, 1: 41163 > 28619 | |
speed: 3 | 129, 128, 1: 41414 > 28619 | |
speed: 3 | 129, 128, 1: 41414 > 28619 | |
speed: 3 | 129, 129, 1: 41667 > 28619 | |
speed: 3 | 129, 129, 1: 41667 > 28619 | |
speed: 176 | 129, 129, 63: 29763 > 28619 | |
speed: 176 | 129, 129, 63: 29763 > 28619 | |
speed: 194 | 129, 129, 64: 29634 > 28619 | |
speed: 194 | 129, 129, 64: 29634 > 28619 | |
speed: 218 | 129, 129, 65: 29507 > 28619 | |
speed: 218 | 129, 129, 65: 29507 > 28619 | |
Leftward search... | |
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
1, 129, 65: 21059 < 28619 <-- Best distance, but no matching solution | |
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
1, 129, 65: 21059 < 28619 <-- Best distance, but no matching solution | |
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
1, 129, 64: 21186 < 28619 <-- Best distance, but no matching solution | |
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
1, 129, 64: 21186 < 28619 <-- Best distance, but no matching solution | |
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
1, 129, 63: 21315 < 28619 <-- Best distance, but no matching solution | |
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
1, 129, 63: 21315 < 28619 <-- Best distance, but no matching solution | |
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
1, 128, 65: 20806 < 28619 <-- Best distance, but no matching solution | |
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
1, 128, 65: 20806 < 28619 <-- Best distance, but no matching solution | |
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
1, 128, 64: 20933 < 28619 <-- Best distance, but no matching solution | |
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
1, 128, 64: 20933 < 28619 <-- Best distance, but no matching solution | |
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
1, 128, 63: 21062 < 28619 <-- Best distance, but no matching solution | |
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
1, 128, 63: 21062 < 28619 <-- Best distance, but no matching solution | |
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
1, 127, 65: 20555 < 28619 <-- Best distance, but no matching solution | |
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
1, 127, 65: 20555 < 28619 <-- Best distance, but no matching solution | |
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
1, 127, 64: 20682 < 28619 <-- Best distance, but no matching solution | |
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
1, 127, 64: 20682 < 28619 <-- Best distance, but no matching solution | |
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
1, 127, 63: 20811 < 28619 <-- Best distance, but no matching solution | |
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
1, 127, 63: 20811 < 28619 <-- Best distance, but no matching solution | |
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
1, 1, 1: 17091 < 28619 <-- Best distance, but no matching solution | |
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
1, 1, 1: 17091 < 28619 <-- Best distance, but no matching solution | |
Considered 100% of entries. | |
Solution index selected: 26093 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 00 20 88 e1 7f 00 00 (0x7fe188200000) | |
[24..31] batchB: 00 01 20 88 e1 7f 00 00 (0x7fe188200100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: 40 00 00 00 (64) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: 40 00 00 00 (64) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 20 00 00 (8192) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 02 00 00 00 (2) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
AMDGPU(matches: And(Processor(gfx90a): 0, CUCount(104): 0): 0 | |
): 0 | |
AMDGPU(matches: Processor(gfx1030): 1): 1 | |
ProblemMap Searching for Contraction_l_Alik_Bljk_Cijk_Dijk found Problem library (1 rows) | |
TruePred: 1 | |
TruePred: 1 | |
And(TypesEqual(a:Half == Float&& b:Half == Float&& c:Half == Float&& d:Half == Float): 0): 0 | |
And(): 1 | |
TruePred: 1 | |
Object key: 128, 2, 32 | |
Key: 128, 2, 32 | |
Starting point: 128, 64, 256 | |
Rightward search... | |
TruePred: 1And(): 1 | |
speed: 1.245 | 128, 64, 256: 54020 == 54020 | |
speed: 1.245 | 128, 64, 256: 54020 == 54020 | |
speed: 1.96 | 128, 64, 1280: 1.56135e+06 > 54020 | |
speed: 1.96 | 128, 64, 1280: 1.56135e+06 > 54020 | |
speed: 2.184 | 128, 64, 3328: 1.08675e+07 > 54020 | |
speed: 2.184 | 128, 64, 3328: 1.08675e+07 > 54020 | |
speed: 2.463 | 128, 128, 256: 66052 > 54020 | |
speed: 2.463 | 128, 128, 256: 66052 > 54020 | |
speed: 3.797 | 128, 128, 1280: 1.57338e+06 > 54020 | |
speed: 3.797 | 128, 128, 1280: 1.57338e+06 > 54020 | |
speed: 4.133 | 128, 128, 3328: 1.08795e+07 > 54020 | |
speed: 4.133 | 128, 128, 3328: 1.08795e+07 > 54020 | |
speed: 4.778 | 128, 256, 256: 114692 > 54020 | |
speed: 4.778 | 128, 256, 256: 114692 > 54020 | |
speed: 7.072 | 128, 256, 1280: 1.62202e+06 > 54020 | |
speed: 7.072 | 128, 256, 1280: 1.62202e+06 > 54020 | |
speed: 7.679 | 128, 256, 3328: 1.09281e+07 > 54020 | |
speed: 7.679 | 128, 256, 3328: 1.09281e+07 > 54020 | |
speed: 7.81 | 128, 448, 256: 249092 > 54020 | |
speed: 7.81 | 128, 448, 256: 249092 > 54020 | |
speed: 10.765 | 128, 448, 1280: 1.75642e+06 > 54020 | |
speed: 10.765 | 128, 448, 1280: 1.75642e+06 > 54020 | |
speed: 11.536 | 128, 448, 3328: 1.10625e+07 > 54020 | |
speed: 11.536 | 128, 448, 3328: 1.10625e+07 > 54020 | |
speed: 10.575 | 128, 512, 784: 825604 > 54020 | |
speed: 10.575 | 128, 512, 784: 825604 > 54020 | |
speed: 10.842 | 128, 704, 256: 542980 > 54020 | |
speed: 10.842 | 128, 704, 256: 542980 > 54020 | |
speed: 13.551 | 128, 704, 1280: 2.05031e+06 > 54020 | |
speed: 13.551 | 128, 704, 1280: 2.05031e+06 > 54020 | |
speed: 14.201 | 128, 704, 3328: 1.13564e+07 > 54020 | |
speed: 14.201 | 128, 704, 3328: 1.13564e+07 > 54020 | |
speed: 13.713 | 128, 1024, 256: 1.09466e+06 > 54020 | |
speed: 13.713 | 128, 1024, 256: 1.09466e+06 > 54020 | |
speed: 18.196 | 128, 1024, 1280: 2.60199e+06 > 54020 | |
speed: 18.196 | 128, 1024, 1280: 2.60199e+06 > 54020 | |
speed: 20.197 | 128, 1024, 3328: 1.19081e+07 > 54020 | |
speed: 20.197 | 128, 1024, 3328: 1.19081e+07 > 54020 | |
speed: 16.111 | 128, 1408, 256: 2.02701e+06 > 54020 | |
speed: 16.111 | 128, 1408, 256: 2.02701e+06 > 54020 | |
speed: 20.093 | 128, 1408, 1280: 3.53434e+06 > 54020 | |
speed: 20.093 | 128, 1408, 1280: 3.53434e+06 > 54020 | |
speed: 21.3 | 128, 1408, 3328: 1.28405e+07 > 54020 | |
speed: 21.3 | 128, 1408, 3328: 1.28405e+07 > 54020 | |
speed: 19.543 | 128, 1856, 256: 3.48749e+06 > 54020 | |
speed: 19.543 | 128, 1856, 256: 3.48749e+06 > 54020 | |
speed: 24.172 | 128, 1856, 1280: 4.99482e+06 > 54020 | |
speed: 24.172 | 128, 1856, 1280: 4.99482e+06 > 54020 | |
speed: 25.184 | 128, 1856, 3328: 1.43009e+07 > 54020 | |
speed: 25.184 | 128, 1856, 3328: 1.43009e+07 > 54020 | |
speed: 19.109 | 128, 2368, 256: 5.64813e+06 > 54020 | |
speed: 19.109 | 128, 2368, 256: 5.64813e+06 > 54020 | |
speed: 28.546 | 128, 2368, 1280: 7.15546e+06 > 54020 | |
speed: 28.546 | 128, 2368, 1280: 7.15546e+06 > 54020 | |
speed: 31.934 | 128, 2368, 3328: 1.64616e+07 > 54020 | |
speed: 31.934 | 128, 2368, 3328: 1.64616e+07 > 54020 | |
speed: 23.005 | 128, 2944, 256: 8.70554e+06 > 54020 | |
speed: 23.005 | 128, 2944, 256: 8.70554e+06 > 54020 | |
speed: 35.983 | 128, 2944, 1280: 1.02129e+07 > 54020 | |
speed: 35.983 | 128, 2944, 1280: 1.02129e+07 > 54020 | |
speed: 38.659 | 128, 2944, 3328: 1.9519e+07 > 54020 | |
speed: 38.659 | 128, 2944, 3328: 1.9519e+07 > 54020 | |
speed: 28.004 | 128, 3584, 256: 1.28809e+07 > 54020 | |
speed: 28.004 | 128, 3584, 256: 1.28809e+07 > 54020 | |
speed: 43.403 | 128, 3584, 1280: 1.43882e+07 > 54020 | |
speed: 43.403 | 128, 3584, 1280: 1.43882e+07 > 54020 | |
speed: 47.245 | 128, 3584, 3328: 2.36943e+07 > 54020 | |
speed: 47.245 | 128, 3584, 3328: 2.36943e+07 > 54020 | |
speed: 33.157 | 128, 4288, 256: 1.842e+07 > 54020 | |
speed: 33.157 | 128, 4288, 256: 1.842e+07 > 54020 | |
speed: 50.058 | 128, 4288, 1280: 1.99273e+07 > 54020 | |
speed: 50.058 | 128, 4288, 1280: 1.99273e+07 > 54020 | |
speed: 55.806 | 128, 4288, 3328: 2.92334e+07 > 54020 | |
speed: 55.806 | 128, 4288, 3328: 2.92334e+07 > 54020 | |
speed: 33.31 | 128, 5056, 256: 2.55931e+07 > 54020 | |
speed: 33.31 | 128, 5056, 256: 2.55931e+07 > 54020 | |
speed: 44.366 | 128, 5056, 1280: 2.71004e+07 > 54020 | |
speed: 44.366 | 128, 5056, 1280: 2.71004e+07 > 54020 | |
speed: 46.466 | 128, 5056, 3328: 3.64065e+07 > 54020 | |
speed: 46.466 | 128, 5056, 3328: 3.64065e+07 > 54020 | |
speed: 38.449 | 128, 5888, 256: 3.46952e+07 > 54020 | |
speed: 38.449 | 128, 5888, 256: 3.46952e+07 > 54020 | |
speed: 51.398 | 128, 5888, 1280: 3.62025e+07 > 54020 | |
speed: 51.398 | 128, 5888, 1280: 3.62025e+07 > 54020 | |
speed: 53.611 | 128, 5888, 3328: 4.55086e+07 > 54020 | |
speed: 53.611 | 128, 5888, 3328: 4.55086e+07 > 54020 | |
speed: 39.487 | 128, 6784, 256: 4.60457e+07 > 54020 | |
speed: 39.487 | 128, 6784, 256: 4.60457e+07 > 54020 | |
speed: 50.901 | 128, 6784, 1280: 4.7553e+07 > 54020 | |
speed: 50.901 | 128, 6784, 1280: 4.7553e+07 > 54020 | |
speed: 53.044 | 128, 6784, 3328: 5.68591e+07 > 54020 | |
speed: 53.044 | 128, 6784, 3328: 5.68591e+07 > 54020 | |
speed: 2.452 | 256, 64, 256: 70404 > 54020 | |
speed: 2.452 | 256, 64, 256: 70404 > 54020 | |
speed: 3.803 | 256, 64, 1280: 1.57773e+06 > 54020 | |
speed: 3.803 | 256, 64, 1280: 1.57773e+06 > 54020 | |
speed: 4.14 | 256, 64, 3136: 9.65504e+06 > 54020 | |
speed: 4.14 | 256, 64, 3136: 9.65504e+06 > 54020 | |
speed: 4.135 | 256, 64, 3328: 1.08838e+07 > 54020 | |
speed: 4.135 | 256, 64, 3328: 1.08838e+07 > 54020 | |
speed: 4.755 | 256, 128, 256: 82436 > 54020 | |
speed: 4.755 | 256, 128, 256: 82436 > 54020 | |
speed: 7.05 | 256, 128, 1280: 1.58976e+06 > 54020 | |
speed: 7.05 | 256, 128, 1280: 1.58976e+06 > 54020 | |
speed: 7.688 | 256, 128, 3328: 1.08959e+07 > 54020 | |
speed: 7.688 | 256, 128, 3328: 1.08959e+07 > 54020 | |
speed: 8.926 | 256, 256, 256: 131076 > 54020 | |
speed: 8.926 | 256, 256, 256: 131076 > 54020 | |
speed: 12.13 | 256, 256, 1280: 1.6384e+06 > 54020 | |
speed: 12.13 | 256, 256, 1280: 1.6384e+06 > 54020 | |
speed: 12.493 | 256, 256, 3328: 1.09445e+07 > 54020 | |
speed: 12.493 | 256, 256, 3328: 1.09445e+07 > 54020 | |
speed: 12.13 | 256, 448, 256: 265476 > 54020 | |
speed: 12.13 | 256, 448, 256: 265476 > 54020 | |
speed: 15.829 | 256, 448, 1280: 1.7728e+06 > 54020 | |
speed: 15.829 | 256, 448, 1280: 1.7728e+06 > 54020 | |
speed: 17.968 | 256, 448, 3328: 1.10789e+07 > 54020 | |
speed: 17.968 | 256, 448, 3328: 1.10789e+07 > 54020 | |
speed: 16.364 | 256, 704, 256: 559364 > 54020 | |
speed: 16.364 | 256, 704, 256: 559364 > 54020 | |
speed: 19.563 | 256, 704, 1280: 2.06669e+06 > 54020 | |
speed: 19.563 | 256, 704, 1280: 2.06669e+06 > 54020 | |
speed: 21.347 | 256, 704, 3328: 1.13728e+07 > 54020 | |
speed: 21.347 | 256, 704, 3328: 1.13728e+07 > 54020 | |
speed: 19.579 | 256, 1024, 196: 1.08776e+06 > 54020 | |
speed: 19.579 | 256, 1024, 196: 1.08776e+06 > 54020 | |
speed: 21.444 | 256, 1024, 256: 1.11104e+06 > 54020 | |
speed: 21.444 | 256, 1024, 256: 1.11104e+06 > 54020 | |
speed: 26.615 | 256, 1024, 1280: 2.61837e+06 > 54020 | |
speed: 26.615 | 256, 1024, 1280: 2.61837e+06 > 54020 | |
speed: 27.154 | 256, 1024, 3328: 1.19245e+07 > 54020 | |
speed: 27.154 | 256, 1024, 3328: 1.19245e+07 > 54020 | |
speed: 23.601 | 256, 1408, 256: 2.0434e+06 > 54020 | |
speed: 23.601 | 256, 1408, 256: 2.0434e+06 > 54020 | |
speed: 34.533 | 256, 1408, 1280: 3.55072e+06 > 54020 | |
speed: 34.533 | 256, 1408, 1280: 3.55072e+06 > 54020 | |
speed: 37.51 | 256, 1408, 3328: 1.28568e+07 > 54020 | |
speed: 37.51 | 256, 1408, 3328: 1.28568e+07 > 54020 | |
speed: 30.488 | 256, 1856, 256: 3.50388e+06 > 54020 | |
speed: 30.488 | 256, 1856, 256: 3.50388e+06 > 54020 | |
speed: 43.527 | 256, 1856, 1280: 5.0112e+06 > 54020 | |
speed: 43.527 | 256, 1856, 1280: 5.0112e+06 > 54020 | |
speed: 49.667 | 256, 1856, 3328: 1.43173e+07 > 54020 | |
speed: 49.667 | 256, 1856, 3328: 1.43173e+07 > 54020 | |
speed: 31.426 | 256, 2368, 256: 5.66452e+06 > 54020 | |
speed: 31.426 | 256, 2368, 256: 5.66452e+06 > 54020 | |
speed: 41.677 | 256, 2368, 1280: 7.17184e+06 > 54020 | |
speed: 41.677 | 256, 2368, 1280: 7.17184e+06 > 54020 | |
speed: 43.82 | 256, 2368, 3328: 1.6478e+07 > 54020 | |
speed: 43.82 | 256, 2368, 3328: 1.6478e+07 > 54020 | |
speed: 38.792 | 256, 2944, 256: 8.72192e+06 > 54020 | |
speed: 38.792 | 256, 2944, 256: 8.72192e+06 > 54020 | |
speed: 51.301 | 256, 2944, 1280: 1.02293e+07 > 54020 | |
speed: 51.301 | 256, 2944, 1280: 1.02293e+07 > 54020 | |
speed: 53.55 | 256, 2944, 3328: 1.95354e+07 > 54020 | |
speed: 53.55 | 256, 2944, 3328: 1.95354e+07 > 54020 | |
speed: 41.654 | 256, 3584, 256: 1.28973e+07 > 54020 | |
speed: 41.654 | 256, 3584, 256: 1.28973e+07 > 54020 | |
speed: 53.759 | 256, 3584, 1280: 1.44046e+07 > 54020 | |
speed: 53.759 | 256, 3584, 1280: 1.44046e+07 > 54020 | |
speed: 55.912 | 256, 3584, 3328: 2.37107e+07 > 54020 | |
speed: 55.912 | 256, 3584, 3328: 2.37107e+07 > 54020 | |
speed: 47.31 | 256, 4288, 256: 1.84364e+07 > 54020 | |
speed: 47.31 | 256, 4288, 256: 1.84364e+07 > 54020 | |
speed: 61.452 | 256, 4288, 1280: 1.99437e+07 > 54020 | |
speed: 61.452 | 256, 4288, 1280: 1.99437e+07 > 54020 | |
speed: 65.919 | 256, 4288, 3328: 2.92498e+07 > 54020 | |
speed: 65.919 | 256, 4288, 3328: 2.92498e+07 > 54020 | |
speed: 51.127 | 256, 5056, 256: 2.56095e+07 > 54020 | |
speed: 51.127 | 256, 5056, 256: 2.56095e+07 > 54020 | |
speed: 57.676 | 256, 5056, 1280: 2.71168e+07 > 54020 | |
speed: 57.676 | 256, 5056, 1280: 2.71168e+07 > 54020 | |
speed: 56.189 | 256, 5056, 3328: 3.64229e+07 > 54020 | |
speed: 56.189 | 256, 5056, 3328: 3.64229e+07 > 54020 | |
speed: 54.885 | 256, 5888, 256: 3.47116e+07 > 54020 | |
speed: 54.885 | 256, 5888, 256: 3.47116e+07 > 54020 | |
speed: 61.772 | 256, 5888, 1280: 3.62189e+07 > 54020 | |
speed: 61.772 | 256, 5888, 1280: 3.62189e+07 > 54020 | |
speed: 66.076 | 256, 5888, 3328: 4.5525e+07 > 54020 | |
speed: 66.076 | 256, 5888, 3328: 4.5525e+07 > 54020 | |
speed: 62.92 | 256, 6784, 256: 4.60621e+07 > 54020 | |
speed: 62.92 | 256, 6784, 256: 4.60621e+07 > 54020 | |
speed: 70.89 | 256, 6784, 1280: 4.75694e+07 > 54020 | |
speed: 70.89 | 256, 6784, 1280: 4.75694e+07 > 54020 | |
speed: 72.458 | 256, 6784, 3328: 5.68755e+07 > 54020 | |
speed: 72.458 | 256, 6784, 3328: 5.68755e+07 > 54020 | |
448, 64, 256: Stopping rightward search early. | |
Leftward search... | |
64, 6784, 3328: 5.68632e+07 > 54020 | |
64, 6784, 3328: 5.68632e+07 > 54020 | |
64, 6784, 1280: 4.75571e+07 > 54020 | |
64, 6784, 1280: 4.75571e+07 > 54020 | |
64, 6784, 256: 4.60498e+07 > 54020 | |
64, 6784, 256: 4.60498e+07 > 54020 | |
64, 5888, 3328: 4.55127e+07 > 54020 | |
64, 5888, 3328: 4.55127e+07 > 54020 | |
64, 5888, 1280: 3.62066e+07 > 54020 | |
64, 5888, 1280: 3.62066e+07 > 54020 | |
64, 5888, 256: 3.46993e+07 > 54020 | |
64, 5888, 256: 3.46993e+07 > 54020 | |
64, 5056, 3328: 3.64106e+07 > 54020 | |
64, 5056, 3328: 3.64106e+07 > 54020 | |
64, 5056, 1280: 2.71045e+07 > 54020 | |
64, 5056, 1280: 2.71045e+07 > 54020 | |
64, 5056, 256: 2.55972e+07 > 54020 | |
64, 5056, 256: 2.55972e+07 > 54020 | |
64, 4288, 3328: 2.92375e+07 > 54020 | |
64, 4288, 3328: 2.92375e+07 > 54020 | |
64, 4288, 1280: 1.99314e+07 > 54020 | |
64, 4288, 1280: 1.99314e+07 > 54020 | |
64, 4288, 256: 1.84241e+07 > 54020 | |
64, 4288, 256: 1.84241e+07 > 54020 | |
64, 3584, 3328: 2.36984e+07 > 54020 | |
64, 3584, 3328: 2.36984e+07 > 54020 | |
64, 3584, 1280: 1.43923e+07 > 54020 | |
64, 3584, 1280: 1.43923e+07 > 54020 | |
64, 3584, 256: 1.2885e+07 > 54020 | |
64, 3584, 256: 1.2885e+07 > 54020 | |
64, 2944, 3328: 1.95231e+07 > 54020 | |
64, 2944, 3328: 1.95231e+07 > 54020 | |
64, 2944, 1280: 1.0217e+07 > 54020 | |
64, 2944, 1280: 1.0217e+07 > 54020 | |
64, 2944, 256: 8.70964e+06 > 54020 | |
64, 2944, 256: 8.70964e+06 > 54020 | |
64, 2368, 3328: 1.64657e+07 > 54020 | |
64, 2368, 3328: 1.64657e+07 > 54020 | |
64, 2368, 1280: 7.15956e+06 > 54020 | |
64, 2368, 1280: 7.15956e+06 > 54020 | |
64, 2368, 256: 5.65223e+06 > 54020 | |
64, 2368, 256: 5.65223e+06 > 54020 | |
64, 1856, 3328: 1.4305e+07 > 54020 | |
64, 1856, 3328: 1.4305e+07 > 54020 | |
64, 1856, 1280: 4.99892e+06 > 54020 | |
64, 1856, 1280: 4.99892e+06 > 54020 | |
64, 1856, 256: 3.49159e+06 > 54020 | |
64, 1856, 256: 3.49159e+06 > 54020 | |
64, 1408, 3328: 1.28445e+07 > 54020 | |
64, 1408, 3328: 1.28445e+07 > 54020 | |
64, 1408, 1280: 3.53844e+06 > 54020 | |
64, 1408, 1280: 3.53844e+06 > 54020 | |
64, 1408, 256: 2.03111e+06 > 54020 | |
64, 1408, 256: 2.03111e+06 > 54020 | |
64, 1024, 3328: 1.19122e+07 > 54020 | |
64, 1024, 3328: 1.19122e+07 > 54020 | |
64, 1024, 1280: 2.60608e+06 > 54020 | |
64, 1024, 1280: 2.60608e+06 > 54020 | |
64, 1024, 256: 1.09876e+06 > 54020 | |
64, 1024, 256: 1.09876e+06 > 54020 | |
64, 704, 3328: 1.13605e+07 > 54020 | |
64, 704, 3328: 1.13605e+07 > 54020 | |
64, 704, 1280: 2.0544e+06 > 54020 | |
64, 704, 1280: 2.0544e+06 > 54020 | |
64, 704, 256: 547076 > 54020 | |
64, 704, 256: 547076 > 54020 | |
64, 448, 3328: 1.10666e+07 > 54020 | |
64, 448, 3328: 1.10666e+07 > 54020 | |
64, 448, 1280: 1.76052e+06 > 54020 | |
64, 448, 1280: 1.76052e+06 > 54020 | |
64, 448, 256: 253188 > 54020 | |
64, 448, 256: 253188 > 54020 | |
64, 256, 3328: 1.09322e+07 > 54020 | |
64, 256, 3328: 1.09322e+07 > 54020 | |
64, 256, 3136: 9.70343e+06 > 54020 | |
64, 256, 3136: 9.70343e+06 > 54020 | |
64, 256, 1280: 1.62612e+06 > 54020 | |
64, 256, 1280: 1.62612e+06 > 54020 | |
64, 256, 256: 118788 > 54020 | |
64, 256, 256: 118788 > 54020 | |
64, 128, 3328: 1.08836e+07 > 54020 | |
64, 128, 3328: 1.08836e+07 > 54020 | |
64, 128, 1280: 1.57748e+06 > 54020 | |
64, 128, 1280: 1.57748e+06 > 54020 | |
64, 128, 256: 70148 > 54020 | |
64, 128, 256: 70148 > 54020 | |
64, 64, 3328: 1.08716e+07 > 54020 | |
64, 64, 3328: 1.08716e+07 > 54020 | |
64, 64, 3136: 9.64276e+06 > 54020 | |
64, 64, 3136: 9.64276e+06 > 54020 | |
64, 64, 1280: 1.56544e+06 > 54020 | |
64, 64, 1280: 1.56544e+06 > 54020 | |
64, 64, 256: 58116 > 54020 | |
64, 64, 256: 58116 > 54020 | |
Considered 17.5329% of entries. | |
Solution index selected: 22645 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 00 01 00 00 (256) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 00 01 00 00 (256) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: 40 00 00 00 (64) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 02 00 00 00 (2) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: 40 00 00 00 (64) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: 40 00 00 00 (64) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 20 00 00 (8192) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 02 00 00 00 (2) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 00 01 00 00 (256) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 00 01 00 00 (256) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: 40 00 00 00 (64) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 02 00 00 00 (2) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: 40 00 00 00 (64) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: 40 00 00 00 (64) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 20 00 00 (8192) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 02 00 00 00 (2) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 00 01 00 00 (256) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 00 01 00 00 (256) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: 40 00 00 00 (64) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 02 00 00 00 (2) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: 40 00 00 00 (64) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: 40 00 00 00 (64) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 20 00 00 (8192) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 02 00 00 00 (2) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 00 01 00 00 (256) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 00 01 00 00 (256) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: 40 00 00 00 (64) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 02 00 00 00 (2) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: 40 00 00 00 (64) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: 40 00 00 00 (64) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 20 00 00 (8192) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 02 00 00 00 (2) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 00 01 00 00 (256) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 00 01 00 00 (256) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: 40 00 00 00 (64) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 02 00 00 00 (2) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: 40 00 00 00 (64) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: 40 00 00 00 (64) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 20 00 00 (8192) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 02 00 00 00 (2) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 00 01 00 00 (256) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 00 01 00 00 (256) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: 40 00 00 00 (64) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 02 00 00 00 (2) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: 40 00 00 00 (64) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: 40 00 00 00 (64) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 20 00 00 (8192) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 02 00 00 00 (2) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 00 01 00 00 (256) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 00 01 00 00 (256) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: 40 00 00 00 (64) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 02 00 00 00 (2) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: 40 00 00 00 (64) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: 40 00 00 00 (64) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 20 00 00 (8192) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 02 00 00 00 (2) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 00 01 00 00 (256) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 00 01 00 00 (256) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: 40 00 00 00 (64) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 02 00 00 00 (2) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: 40 00 00 00 (64) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: 40 00 00 00 (64) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 20 00 00 (8192) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 02 00 00 00 (2) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 00 01 00 00 (256) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 00 01 00 00 (256) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: 40 00 00 00 (64) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 02 00 00 00 (2) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: 40 00 00 00 (64) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: 40 00 00 00 (64) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 20 00 00 (8192) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 02 00 00 00 (2) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 00 01 00 00 (256) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 00 01 00 00 (256) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: 40 00 00 00 (64) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 02 00 00 00 (2) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: 40 00 00 00 (64) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: 40 00 00 00 (64) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 20 00 00 (8192) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 02 00 00 00 (2) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 00 01 00 00 (256) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 00 01 00 00 (256) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: 40 00 00 00 (64) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 02 00 00 00 (2) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: 40 00 00 00 (64) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: 40 00 00 00 (64) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 20 00 00 (8192) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 02 00 00 00 (2) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 00 01 00 00 (256) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 00 01 00 00 (256) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: 40 00 00 00 (64) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 02 00 00 00 (2) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: 40 00 00 00 (64) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: 40 00 00 00 (64) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 20 00 00 (8192) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 02 00 00 00 (2) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 00 01 00 00 (256) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 00 01 00 00 (256) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: 40 00 00 00 (64) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 02 00 00 00 (2) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: 40 00 00 00 (64) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: 40 00 00 00 (64) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 20 00 00 (8192) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 02 00 00 00 (2) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 00 01 00 00 (256) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 00 01 00 00 (256) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: 40 00 00 00 (64) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 02 00 00 00 (2) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: 40 00 00 00 (64) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: 40 00 00 00 (64) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 20 00 00 (8192) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 02 00 00 00 (2) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 00 01 00 00 (256) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 00 01 00 00 (256) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: 40 00 00 00 (64) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 02 00 00 00 (2) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: 40 00 00 00 (64) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: 40 00 00 00 (64) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 20 00 00 (8192) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 02 00 00 00 (2) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 00 01 00 00 (256) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 00 01 00 00 (256) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: 40 00 00 00 (64) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 02 00 00 00 (2) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: 40 00 00 00 (64) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: 40 00 00 00 (64) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 20 00 00 (8192) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 02 00 00 00 (2) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 00 01 00 00 (256) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 00 01 00 00 (256) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: 40 00 00 00 (64) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 02 00 00 00 (2) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: 40 00 00 00 (64) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: 40 00 00 00 (64) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 20 00 00 (8192) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 02 00 00 00 (2) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 00 01 00 00 (256) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 00 01 00 00 (256) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: 40 00 00 00 (64) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 02 00 00 00 (2) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: 40 00 00 00 (64) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: 40 00 00 00 (64) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 20 00 00 (8192) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 02 00 00 00 (2) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 00 01 00 00 (256) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 00 01 00 00 (256) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: 40 00 00 00 (64) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 02 00 00 00 (2) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: 40 00 00 00 (64) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: 40 00 00 00 (64) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 20 00 00 (8192) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 02 00 00 00 (2) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 00 01 00 00 (256) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 00 01 00 00 (256) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: 40 00 00 00 (64) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 02 00 00 00 (2) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: 40 00 00 00 (64) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: 40 00 00 00 (64) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 20 00 00 (8192) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 02 00 00 00 (2) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 00 01 00 00 (256) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 00 01 00 00 (256) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: 40 00 00 00 (64) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 02 00 00 00 (2) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: 40 00 00 00 (64) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: 40 00 00 00 (64) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 20 00 00 (8192) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 02 00 00 00 (2) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 00 01 00 00 (256) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 00 01 00 00 (256) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: 40 00 00 00 (64) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 02 00 00 00 (2) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: 40 00 00 00 (64) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: 40 00 00 00 (64) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 20 00 00 (8192) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 02 00 00 00 (2) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 00 01 00 00 (256) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 00 01 00 00 (256) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: 40 00 00 00 (64) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 02 00 00 00 (2) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: 40 00 00 00 (64) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: 40 00 00 00 (64) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 20 00 00 (8192) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 02 00 00 00 (2) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 00 01 00 00 (256) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 00 01 00 00 (256) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: 40 00 00 00 (64) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 02 00 00 00 (2) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: 40 00 00 00 (64) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: 40 00 00 00 (64) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 20 00 00 (8192) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 02 00 00 00 (2) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 00 01 00 00 (256) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 00 01 00 00 (256) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: 40 00 00 00 (64) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 02 00 00 00 (2) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: 40 00 00 00 (64) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: 40 00 00 00 (64) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 20 00 00 (8192) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 02 00 00 00 (2) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 00 01 00 00 (256) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 00 01 00 00 (256) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: 40 00 00 00 (64) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 02 00 00 00 (2) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: 40 00 00 00 (64) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: 40 00 00 00 (64) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 20 00 00 (8192) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 02 00 00 00 (2) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 00 01 00 00 (256) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 00 01 00 00 (256) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: 40 00 00 00 (64) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 02 00 00 00 (2) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: 40 00 00 00 (64) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: 40 00 00 00 (64) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 20 00 00 (8192) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 02 00 00 00 (2) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 00 01 00 00 (256) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 00 01 00 00 (256) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: 40 00 00 00 (64) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 02 00 00 00 (2) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: 40 00 00 00 (64) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: 40 00 00 00 (64) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 20 00 00 (8192) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 02 00 00 00 (2) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 00 01 00 00 (256) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 00 01 00 00 (256) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: 40 00 00 00 (64) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 02 00 00 00 (2) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: 40 00 00 00 (64) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: 40 00 00 00 (64) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 20 00 00 (8192) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 02 00 00 00 (2) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 00 01 00 00 (256) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 00 01 00 00 (256) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: 40 00 00 00 (64) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 02 00 00 00 (2) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: 40 00 00 00 (64) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: 40 00 00 00 (64) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 20 00 00 (8192) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 02 00 00 00 (2) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 00 01 00 00 (256) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 00 01 00 00 (256) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: 40 00 00 00 (64) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 02 00 00 00 (2) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: 40 00 00 00 (64) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: 40 00 00 00 (64) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 20 00 00 (8192) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 02 00 00 00 (2) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 00 01 00 00 (256) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 00 01 00 00 (256) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: 40 00 00 00 (64) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 02 00 00 00 (2) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
AMDGPU(matches: And(Processor(gfx90a): 0, CUCount(104): 0): 0 | |
): 0 | |
AMDGPU(matches: Processor(gfx1030): 1): 1 | |
ProblemMap Searching for Contraction_l_Alik_Bljk_Cijk_Dijk found Problem library (1 rows) | |
TruePred: 1 | |
TruePred: 1 | |
And(TypesEqual(a:Half == Float&& b:Half == Float&& c:Float == Float&& d:Float == Float): 0, HighPrecisionAccumulate(0): 0): 0 | |
And(TypesEqual(a:Half == Half&& b:Half == Half&& c:Float == Half&& d:Float == Half): 0, HighPrecisionAccumulate(0): 0): 0 | |
And(TypesEqual(a:Half == Half&& b:Half == Half&& c:Float == Half&& d:Float == Half): 0): 0 | |
And(TypesEqual(a:Half == Int8&& b:Half == Int8&& c:Float == Int32&& d:Float == Int32): 0): 0 | |
AMDGPU(matches: Processor(gfx1100): 0): 0 | |
AMDGPU(matches: Processor(gfx1101): 0): 0 | |
AMDGPU(matches: Processor(gfx1102): 0): 0 | |
AMDGPU(matches: Processor(gfx803): 0): 0 | |
AMDGPU(matches: Processor(gfx900): 0): 0 | |
AMDGPU(matches: Processor(gfx906): 0): 0 | |
AMDGPU(matches: Processor(gfx908): 0): 0 | |
AMDGPU(matches: Processor(gfx90a): 0): 0 | |
TruePred: 1 | |
ProblemMap Searching for Contraction_l_Alik_Bljk_Cijk_Dijk found Problem library (1 rows) | |
TruePred: 1 | |
TruePred: 1 | |
And(TypesEqual(a:Half == Float&& b:Half == Float&& c:Float == Float&& d:Float == Float): 0, HighPrecisionAccumulate(0): 0): 0 | |
And(TypesEqual(a:Half == Double&& b:Half == Double&& c:Float == Double&& d:Float == Double): 0, HighPrecisionAccumulate(0): 0): 0 | |
And(TypesEqual(a:Half == ComplexFloat&& b:Half == ComplexFloat&& c:Float == ComplexFloat&& d:Float == ComplexFloat): 0, HighPrecisionAccumulate(0): 0): 0 | |
And(TypesEqual(a:Half == ComplexDouble&& b:Half == ComplexDouble&& c:Float == ComplexDouble&& d:Float == ComplexDouble): 0, HighPrecisionAccumulate(0): 0): 0 | |
And(): 1 | |
TruePred: 1 | |
Object key: 32, 5, 128 | |
Key: 32, 5, 128 | |
Starting point: 127, 1, 63 | |
Rightward search... | |
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
speed: 3 | 127, 1, 63: 13266 < 1.79769e+308 <-- Best distance, but no matching solution | |
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
speed: 3 | 127, 1, 63: 13266 < 1.79769e+308 <-- Best distance, but no matching solution | |
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
speed: 3 | 127, 1, 64: 13137 < 1.79769e+308 <-- Best distance, but no matching solution | |
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
speed: 3 | 127, 1, 64: 13137 < 1.79769e+308 <-- Best distance, but no matching solution | |
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
speed: 3 | 127, 1, 65: 13010 < 1.79769e+308 <-- Best distance, but no matching solution | |
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
speed: 3 | 127, 1, 65: 13010 < 1.79769e+308 <-- Best distance, but no matching solution | |
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
speed: 3 | 127, 127, 1: 40038 < 1.79769e+308 <-- Best distance, but no matching solution | |
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
speed: 3 | 127, 127, 1: 40038 < 1.79769e+308 <-- Best distance, but no matching solution | |
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
speed: 208 | 127, 127, 63: 28134 < 1.79769e+308 <-- Best distance, but no matching solution | |
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
speed: 208 | 127, 127, 63: 28134 < 1.79769e+308 <-- Best distance, but no matching solution | |
TruePred: 1And(): 1 | |
speed: 179 | 127, 127, 64: 28005 == 28005 | |
speed: 179 | 127, 127, 64: 28005 == 28005 | |
TruePred: 1And(): 1 | |
speed: 213 | 127, 127, 65: 27878 == 27878 | |
speed: 213 | 127, 127, 65: 27878 == 27878 | |
speed: 3 | 127, 128, 1: 40283 > 27878 | |
speed: 3 | 127, 128, 1: 40283 > 27878 | |
speed: 3 | 127, 129, 1: 40530 > 27878 | |
speed: 3 | 127, 129, 1: 40530 > 27878 | |
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
speed: 3 | 128, 1, 63: 13457 < 27878 <-- Best distance, but no matching solution | |
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
speed: 3 | 128, 1, 63: 13457 < 27878 <-- Best distance, but no matching solution | |
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
speed: 3 | 128, 1, 64: 13328 < 27878 <-- Best distance, but no matching solution | |
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
speed: 3 | 128, 1, 64: 13328 < 27878 <-- Best distance, but no matching solution | |
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
speed: 3 | 128, 1, 65: 13201 < 27878 <-- Best distance, but no matching solution | |
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
speed: 3 | 128, 1, 65: 13201 < 27878 <-- Best distance, but no matching solution | |
speed: 3 | 128, 127, 1: 40229 > 27878 | |
speed: 3 | 128, 127, 1: 40229 > 27878 | |
speed: 3 | 128, 128, 1: 40474 > 27878 | |
speed: 3 | 128, 128, 1: 40474 > 27878 | |
speed: 200 | 128, 128, 63: 28570 > 27878 | |
speed: 200 | 128, 128, 63: 28570 > 27878 | |
speed: 176 | 128, 128, 64: 28441 > 27878 | |
speed: 176 | 128, 128, 64: 28441 > 27878 | |
speed: 220 | 128, 128, 65: 28314 > 27878 | |
speed: 220 | 128, 128, 65: 28314 > 27878 | |
speed: 3 | 128, 129, 1: 40721 > 27878 | |
speed: 3 | 128, 129, 1: 40721 > 27878 | |
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
speed: 3 | 129, 1, 63: 13650 < 27878 <-- Best distance, but no matching solution | |
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
speed: 3 | 129, 1, 63: 13650 < 27878 <-- Best distance, but no matching solution | |
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
speed: 3 | 129, 1, 64: 13521 < 27878 <-- Best distance, but no matching solution | |
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
speed: 3 | 129, 1, 64: 13521 < 27878 <-- Best distance, but no matching solution | |
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
speed: 3 | 129, 1, 65: 13394 < 27878 <-- Best distance, but no matching solution | |
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
speed: 3 | 129, 1, 65: 13394 < 27878 <-- Best distance, but no matching solution | |
speed: 3 | 129, 127, 1: 40422 > 27878 | |
speed: 3 | 129, 127, 1: 40422 > 27878 | |
speed: 3 | 129, 128, 1: 40667 > 27878 | |
speed: 3 | 129, 128, 1: 40667 > 27878 | |
speed: 3 | 129, 129, 1: 40914 > 27878 | |
speed: 3 | 129, 129, 1: 40914 > 27878 | |
speed: 176 | 129, 129, 63: 29010 > 27878 | |
speed: 176 | 129, 129, 63: 29010 > 27878 | |
speed: 194 | 129, 129, 64: 28881 > 27878 | |
speed: 194 | 129, 129, 64: 28881 > 27878 | |
speed: 218 | 129, 129, 65: 28754 > 27878 | |
speed: 218 | 129, 129, 65: 28754 > 27878 | |
Leftward search... | |
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
1, 129, 65: 20306 < 27878 <-- Best distance, but no matching solution | |
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
1, 129, 65: 20306 < 27878 <-- Best distance, but no matching solution | |
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
1, 129, 64: 20433 < 27878 <-- Best distance, but no matching solution | |
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
1, 129, 64: 20433 < 27878 <-- Best distance, but no matching solution | |
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
1, 129, 63: 20562 < 27878 <-- Best distance, but no matching solution | |
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
1, 129, 63: 20562 < 27878 <-- Best distance, but no matching solution | |
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
1, 128, 65: 20059 < 27878 <-- Best distance, but no matching solution | |
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
1, 128, 65: 20059 < 27878 <-- Best distance, but no matching solution | |
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
1, 128, 64: 20186 < 27878 <-- Best distance, but no matching solution | |
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
1, 128, 64: 20186 < 27878 <-- Best distance, but no matching solution | |
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
1, 128, 63: 20315 < 27878 <-- Best distance, but no matching solution | |
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
1, 128, 63: 20315 < 27878 <-- Best distance, but no matching solution | |
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
1, 127, 65: 19814 < 27878 <-- Best distance, but no matching solution | |
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
1, 127, 65: 19814 < 27878 <-- Best distance, but no matching solution | |
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
1, 127, 64: 19941 < 27878 <-- Best distance, but no matching solution | |
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
1, 127, 64: 19941 < 27878 <-- Best distance, but no matching solution | |
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
1, 127, 63: 20070 < 27878 <-- Best distance, but no matching solution | |
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
1, 127, 63: 20070 < 27878 <-- Best distance, but no matching solution | |
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
1, 1, 1: 17106 < 27878 <-- Best distance, but no matching solution | |
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0 | |
1, 1, 1: 17106 < 27878 <-- Best distance, but no matching solution | |
Considered 100% of entries. | |
Solution index selected: 26093 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: a0 00 00 00 (160) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: a0 00 00 00 (160) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 50 00 00 (20480) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 05 00 00 00 (5) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
AMDGPU(matches: And(Processor(gfx90a): 0, CUCount(104): 0): 0 | |
): 0 | |
AMDGPU(matches: Processor(gfx1030): 1): 1 | |
ProblemMap Searching for Contraction_l_Alik_Bljk_Cijk_Dijk found Problem library (1 rows) | |
TruePred: 1 | |
TruePred: 1 | |
And(TypesEqual(a:Half == Float&& b:Half == Float&& c:Half == Float&& d:Half == Float): 0): 0 | |
And(): 1 | |
TruePred: 1 | |
Object key: 128, 5, 32 | |
Key: 128, 5, 32 | |
Starting point: 128, 64, 256 | |
Rightward search... | |
TruePred: 1And(): 1 | |
speed: 1.245 | 128, 64, 256: 53657 == 53657 | |
speed: 1.245 | 128, 64, 256: 53657 == 53657 | |
speed: 1.96 | 128, 64, 1280: 1.56098e+06 > 53657 | |
speed: 1.96 | 128, 64, 1280: 1.56098e+06 > 53657 | |
speed: 2.184 | 128, 64, 3328: 1.08671e+07 > 53657 | |
speed: 2.184 | 128, 64, 3328: 1.08671e+07 > 53657 | |
speed: 2.463 | 128, 128, 256: 65305 > 53657 | |
speed: 2.463 | 128, 128, 256: 65305 > 53657 | |
speed: 3.797 | 128, 128, 1280: 1.57263e+06 > 53657 | |
speed: 3.797 | 128, 128, 1280: 1.57263e+06 > 53657 | |
speed: 4.133 | 128, 128, 3328: 1.08787e+07 > 53657 | |
speed: 4.133 | 128, 128, 3328: 1.08787e+07 > 53657 | |
speed: 4.778 | 128, 256, 256: 113177 > 53657 | |
speed: 4.778 | 128, 256, 256: 113177 > 53657 | |
speed: 7.072 | 128, 256, 1280: 1.6205e+06 > 53657 | |
speed: 7.072 | 128, 256, 1280: 1.6205e+06 > 53657 | |
speed: 7.679 | 128, 256, 3328: 1.09266e+07 > 53657 | |
speed: 7.679 | 128, 256, 3328: 1.09266e+07 > 53657 | |
speed: 7.81 | 128, 448, 256: 246425 > 53657 | |
speed: 7.81 | 128, 448, 256: 246425 > 53657 | |
speed: 10.765 | 128, 448, 1280: 1.75375e+06 > 53657 | |
speed: 10.765 | 128, 448, 1280: 1.75375e+06 > 53657 | |
speed: 11.536 | 128, 448, 3328: 1.10599e+07 > 53657 | |
speed: 11.536 | 128, 448, 3328: 1.10599e+07 > 53657 | |
speed: 10.575 | 128, 512, 784: 822553 > 53657 | |
speed: 10.575 | 128, 512, 784: 822553 > 53657 | |
speed: 10.842 | 128, 704, 256: 538777 > 53657 | |
speed: 10.842 | 128, 704, 256: 538777 > 53657 | |
speed: 13.551 | 128, 704, 1280: 2.0461e+06 > 53657 | |
speed: 13.551 | 128, 704, 1280: 2.0461e+06 > 53657 | |
speed: 14.201 | 128, 704, 3328: 1.13522e+07 > 53657 | |
speed: 14.201 | 128, 704, 3328: 1.13522e+07 > 53657 | |
speed: 13.713 | 128, 1024, 256: 1.08854e+06 > 53657 | |
speed: 13.713 | 128, 1024, 256: 1.08854e+06 > 53657 | |
speed: 18.196 | 128, 1024, 1280: 2.59586e+06 > 53657 | |
speed: 18.196 | 128, 1024, 1280: 2.59586e+06 > 53657 | |
speed: 20.197 | 128, 1024, 3328: 1.1902e+07 > 53657 | |
speed: 20.197 | 128, 1024, 3328: 1.1902e+07 > 53657 | |
speed: 16.111 | 128, 1408, 256: 2.01858e+06 > 53657 | |
speed: 16.111 | 128, 1408, 256: 2.01858e+06 > 53657 | |
speed: 20.093 | 128, 1408, 1280: 3.52591e+06 > 53657 | |
speed: 20.093 | 128, 1408, 1280: 3.52591e+06 > 53657 | |
speed: 21.3 | 128, 1408, 3328: 1.2832e+07 > 53657 | |
speed: 21.3 | 128, 1408, 3328: 1.2832e+07 > 53657 | |
speed: 19.543 | 128, 1856, 256: 3.47638e+06 > 53657 | |
speed: 19.543 | 128, 1856, 256: 3.47638e+06 > 53657 | |
speed: 24.172 | 128, 1856, 1280: 4.9837e+06 > 53657 | |
speed: 24.172 | 128, 1856, 1280: 4.9837e+06 > 53657 | |
speed: 25.184 | 128, 1856, 3328: 1.42898e+07 > 53657 | |
speed: 25.184 | 128, 1856, 3328: 1.42898e+07 > 53657 | |
speed: 19.109 | 128, 2368, 256: 5.63394e+06 > 53657 | |
speed: 19.109 | 128, 2368, 256: 5.63394e+06 > 53657 | |
speed: 28.546 | 128, 2368, 1280: 7.14127e+06 > 53657 | |
speed: 28.546 | 128, 2368, 1280: 7.14127e+06 > 53657 | |
speed: 31.934 | 128, 2368, 3328: 1.64474e+07 > 53657 | |
speed: 31.934 | 128, 2368, 3328: 1.64474e+07 > 53657 | |
speed: 23.005 | 128, 2944, 256: 8.6879e+06 > 53657 | |
speed: 23.005 | 128, 2944, 256: 8.6879e+06 > 53657 | |
speed: 35.983 | 128, 2944, 1280: 1.01952e+07 > 53657 | |
speed: 35.983 | 128, 2944, 1280: 1.01952e+07 > 53657 | |
speed: 38.659 | 128, 2944, 3328: 1.95013e+07 > 53657 | |
speed: 38.659 | 128, 2944, 3328: 1.95013e+07 > 53657 | |
speed: 28.004 | 128, 3584, 256: 1.28594e+07 > 53657 | |
speed: 28.004 | 128, 3584, 256: 1.28594e+07 > 53657 | |
speed: 43.403 | 128, 3584, 1280: 1.43667e+07 > 53657 | |
speed: 43.403 | 128, 3584, 1280: 1.43667e+07 > 53657 | |
speed: 47.245 | 128, 3584, 3328: 2.36729e+07 > 53657 | |
speed: 47.245 | 128, 3584, 3328: 2.36729e+07 > 53657 | |
speed: 33.157 | 128, 4288, 256: 1.83943e+07 > 53657 | |
speed: 33.157 | 128, 4288, 256: 1.83943e+07 > 53657 | |
speed: 50.058 | 128, 4288, 1280: 1.99016e+07 > 53657 | |
speed: 50.058 | 128, 4288, 1280: 1.99016e+07 > 53657 | |
speed: 55.806 | 128, 4288, 3328: 2.92077e+07 > 53657 | |
speed: 55.806 | 128, 4288, 3328: 2.92077e+07 > 53657 | |
speed: 33.31 | 128, 5056, 256: 2.55628e+07 > 53657 | |
speed: 33.31 | 128, 5056, 256: 2.55628e+07 > 53657 | |
speed: 44.366 | 128, 5056, 1280: 2.70701e+07 > 53657 | |
speed: 44.366 | 128, 5056, 1280: 2.70701e+07 > 53657 | |
speed: 46.466 | 128, 5056, 3328: 3.63762e+07 > 53657 | |
speed: 46.466 | 128, 5056, 3328: 3.63762e+07 > 53657 | |
speed: 38.449 | 128, 5888, 256: 3.46599e+07 > 53657 | |
speed: 38.449 | 128, 5888, 256: 3.46599e+07 > 53657 | |
speed: 51.398 | 128, 5888, 1280: 3.61672e+07 > 53657 | |
speed: 51.398 | 128, 5888, 1280: 3.61672e+07 > 53657 | |
speed: 53.611 | 128, 5888, 3328: 4.54733e+07 > 53657 | |
speed: 53.611 | 128, 5888, 3328: 4.54733e+07 > 53657 | |
speed: 39.487 | 128, 6784, 256: 4.6005e+07 > 53657 | |
speed: 39.487 | 128, 6784, 256: 4.6005e+07 > 53657 | |
speed: 50.901 | 128, 6784, 1280: 4.75123e+07 > 53657 | |
speed: 50.901 | 128, 6784, 1280: 4.75123e+07 > 53657 | |
speed: 53.044 | 128, 6784, 3328: 5.68185e+07 > 53657 | |
speed: 53.044 | 128, 6784, 3328: 5.68185e+07 > 53657 | |
speed: 2.452 | 256, 64, 256: 70041 > 53657 | |
speed: 2.452 | 256, 64, 256: 70041 > 53657 | |
speed: 3.803 | 256, 64, 1280: 1.57737e+06 > 53657 | |
speed: 3.803 | 256, 64, 1280: 1.57737e+06 > 53657 | |
speed: 4.14 | 256, 64, 3136: 9.65468e+06 > 53657 | |
speed: 4.14 | 256, 64, 3136: 9.65468e+06 > 53657 | |
speed: 4.135 | 256, 64, 3328: 1.08835e+07 > 53657 | |
speed: 4.135 | 256, 64, 3328: 1.08835e+07 > 53657 | |
speed: 4.755 | 256, 128, 256: 81689 > 53657 | |
speed: 4.755 | 256, 128, 256: 81689 > 53657 | |
speed: 7.05 | 256, 128, 1280: 1.58902e+06 > 53657 | |
speed: 7.05 | 256, 128, 1280: 1.58902e+06 > 53657 | |
speed: 7.688 | 256, 128, 3328: 1.08951e+07 > 53657 | |
speed: 7.688 | 256, 128, 3328: 1.08951e+07 > 53657 | |
speed: 8.926 | 256, 256, 256: 129561 > 53657 | |
speed: 8.926 | 256, 256, 256: 129561 > 53657 | |
speed: 12.13 | 256, 256, 1280: 1.63689e+06 > 53657 | |
speed: 12.13 | 256, 256, 1280: 1.63689e+06 > 53657 | |
speed: 12.493 | 256, 256, 3328: 1.0943e+07 > 53657 | |
speed: 12.493 | 256, 256, 3328: 1.0943e+07 > 53657 | |
speed: 12.13 | 256, 448, 256: 262809 > 53657 | |
speed: 12.13 | 256, 448, 256: 262809 > 53657 | |
speed: 15.829 | 256, 448, 1280: 1.77014e+06 > 53657 | |
speed: 15.829 | 256, 448, 1280: 1.77014e+06 > 53657 | |
speed: 17.968 | 256, 448, 3328: 1.10762e+07 > 53657 | |
speed: 17.968 | 256, 448, 3328: 1.10762e+07 > 53657 | |
speed: 16.364 | 256, 704, 256: 555161 > 53657 | |
speed: 16.364 | 256, 704, 256: 555161 > 53657 | |
speed: 19.563 | 256, 704, 1280: 2.06249e+06 > 53657 | |
speed: 19.563 | 256, 704, 1280: 2.06249e+06 > 53657 | |
speed: 21.347 | 256, 704, 3328: 1.13686e+07 > 53657 | |
speed: 21.347 | 256, 704, 3328: 1.13686e+07 > 53657 | |
speed: 19.579 | 256, 1024, 196: 1.08164e+06 > 53657 | |
speed: 19.579 | 256, 1024, 196: 1.08164e+06 > 53657 | |
speed: 21.444 | 256, 1024, 256: 1.10492e+06 > 53657 | |
speed: 21.444 | 256, 1024, 256: 1.10492e+06 > 53657 | |
speed: 26.615 | 256, 1024, 1280: 2.61225e+06 > 53657 | |
speed: 26.615 | 256, 1024, 1280: 2.61225e+06 > 53657 | |
speed: 27.154 | 256, 1024, 3328: 1.19184e+07 > 53657 | |
speed: 27.154 | 256, 1024, 3328: 1.19184e+07 > 53657 | |
speed: 23.601 | 256, 1408, 256: 2.03497e+06 > 53657 | |
speed: 23.601 | 256, 1408, 256: 2.03497e+06 > 53657 | |
speed: 34.533 | 256, 1408, 1280: 3.5423e+06 > 53657 | |
speed: 34.533 | 256, 1408, 1280: 3.5423e+06 > 53657 | |
speed: 37.51 | 256, 1408, 3328: 1.28484e+07 > 53657 | |
speed: 37.51 | 256, 1408, 3328: 1.28484e+07 > 53657 | |
speed: 30.488 | 256, 1856, 256: 3.49276e+06 > 53657 | |
speed: 30.488 | 256, 1856, 256: 3.49276e+06 > 53657 | |
speed: 43.527 | 256, 1856, 1280: 5.00009e+06 > 53657 | |
speed: 43.527 | 256, 1856, 1280: 5.00009e+06 > 53657 | |
speed: 49.667 | 256, 1856, 3328: 1.43062e+07 > 53657 | |
speed: 49.667 | 256, 1856, 3328: 1.43062e+07 > 53657 | |
speed: 31.426 | 256, 2368, 256: 5.65033e+06 > 53657 | |
speed: 31.426 | 256, 2368, 256: 5.65033e+06 > 53657 | |
speed: 41.677 | 256, 2368, 1280: 7.15766e+06 > 53657 | |
speed: 41.677 | 256, 2368, 1280: 7.15766e+06 > 53657 | |
speed: 43.82 | 256, 2368, 3328: 1.64638e+07 > 53657 | |
speed: 43.82 | 256, 2368, 3328: 1.64638e+07 > 53657 | |
speed: 38.792 | 256, 2944, 256: 8.70428e+06 > 53657 | |
speed: 38.792 | 256, 2944, 256: 8.70428e+06 > 53657 | |
speed: 51.301 | 256, 2944, 1280: 1.02116e+07 > 53657 | |
speed: 51.301 | 256, 2944, 1280: 1.02116e+07 > 53657 | |
speed: 53.55 | 256, 2944, 3328: 1.95177e+07 > 53657 | |
speed: 53.55 | 256, 2944, 3328: 1.95177e+07 > 53657 | |
speed: 41.654 | 256, 3584, 256: 1.28758e+07 > 53657 | |
speed: 41.654 | 256, 3584, 256: 1.28758e+07 > 53657 | |
speed: 53.759 | 256, 3584, 1280: 1.43831e+07 > 53657 | |
speed: 53.759 | 256, 3584, 1280: 1.43831e+07 > 53657 | |
speed: 55.912 | 256, 3584, 3328: 2.36892e+07 > 53657 | |
speed: 55.912 | 256, 3584, 3328: 2.36892e+07 > 53657 | |
speed: 47.31 | 256, 4288, 256: 1.84106e+07 > 53657 | |
speed: 47.31 | 256, 4288, 256: 1.84106e+07 > 53657 | |
speed: 61.452 | 256, 4288, 1280: 1.9918e+07 > 53657 | |
speed: 61.452 | 256, 4288, 1280: 1.9918e+07 > 53657 | |
speed: 65.919 | 256, 4288, 3328: 2.92241e+07 > 53657 | |
speed: 65.919 | 256, 4288, 3328: 2.92241e+07 > 53657 | |
speed: 51.127 | 256, 5056, 256: 2.55792e+07 > 53657 | |
speed: 51.127 | 256, 5056, 256: 2.55792e+07 > 53657 | |
speed: 57.676 | 256, 5056, 1280: 2.70865e+07 > 53657 | |
speed: 57.676 | 256, 5056, 1280: 2.70865e+07 > 53657 | |
speed: 56.189 | 256, 5056, 3328: 3.63926e+07 > 53657 | |
speed: 56.189 | 256, 5056, 3328: 3.63926e+07 > 53657 | |
speed: 54.885 | 256, 5888, 256: 3.46762e+07 > 53657 | |
speed: 54.885 | 256, 5888, 256: 3.46762e+07 > 53657 | |
speed: 61.772 | 256, 5888, 1280: 3.61836e+07 > 53657 | |
speed: 61.772 | 256, 5888, 1280: 3.61836e+07 > 53657 | |
speed: 66.076 | 256, 5888, 3328: 4.54897e+07 > 53657 | |
speed: 66.076 | 256, 5888, 3328: 4.54897e+07 > 53657 | |
speed: 62.92 | 256, 6784, 256: 4.60214e+07 > 53657 | |
speed: 62.92 | 256, 6784, 256: 4.60214e+07 > 53657 | |
speed: 70.89 | 256, 6784, 1280: 4.75287e+07 > 53657 | |
speed: 70.89 | 256, 6784, 1280: 4.75287e+07 > 53657 | |
speed: 72.458 | 256, 6784, 3328: 5.68348e+07 > 53657 | |
speed: 72.458 | 256, 6784, 3328: 5.68348e+07 > 53657 | |
448, 64, 256: Stopping rightward search early. | |
Leftward search... | |
64, 6784, 3328: 5.68226e+07 > 53657 | |
64, 6784, 3328: 5.68226e+07 > 53657 | |
64, 6784, 1280: 4.75164e+07 > 53657 | |
64, 6784, 1280: 4.75164e+07 > 53657 | |
64, 6784, 256: 4.60091e+07 > 53657 | |
64, 6784, 256: 4.60091e+07 > 53657 | |
64, 5888, 3328: 4.54774e+07 > 53657 | |
64, 5888, 3328: 4.54774e+07 > 53657 | |
64, 5888, 1280: 3.61713e+07 > 53657 | |
64, 5888, 1280: 3.61713e+07 > 53657 | |
64, 5888, 256: 3.4664e+07 > 53657 | |
64, 5888, 256: 3.4664e+07 > 53657 | |
64, 5056, 3328: 3.63803e+07 > 53657 | |
64, 5056, 3328: 3.63803e+07 > 53657 | |
64, 5056, 1280: 2.70742e+07 > 53657 | |
64, 5056, 1280: 2.70742e+07 > 53657 | |
64, 5056, 256: 2.55669e+07 > 53657 | |
64, 5056, 256: 2.55669e+07 > 53657 | |
64, 4288, 3328: 2.92118e+07 > 53657 | |
64, 4288, 3328: 2.92118e+07 > 53657 | |
64, 4288, 1280: 1.99057e+07 > 53657 | |
64, 4288, 1280: 1.99057e+07 > 53657 | |
64, 4288, 256: 1.83984e+07 > 53657 | |
64, 4288, 256: 1.83984e+07 > 53657 | |
64, 3584, 3328: 2.3677e+07 > 53657 | |
64, 3584, 3328: 2.3677e+07 > 53657 | |
64, 3584, 1280: 1.43708e+07 > 53657 | |
64, 3584, 1280: 1.43708e+07 > 53657 | |
64, 3584, 256: 1.28635e+07 > 53657 | |
64, 3584, 256: 1.28635e+07 > 53657 | |
64, 2944, 3328: 1.95054e+07 > 53657 | |
64, 2944, 3328: 1.95054e+07 > 53657 | |
64, 2944, 1280: 1.01993e+07 > 53657 | |
64, 2944, 1280: 1.01993e+07 > 53657 | |
64, 2944, 256: 8.69199e+06 > 53657 | |
64, 2944, 256: 8.69199e+06 > 53657 | |
64, 2368, 3328: 1.64515e+07 > 53657 | |
64, 2368, 3328: 1.64515e+07 > 53657 | |
64, 2368, 1280: 7.14537e+06 > 53657 | |
64, 2368, 1280: 7.14537e+06 > 53657 | |
64, 2368, 256: 5.63804e+06 > 53657 | |
64, 2368, 256: 5.63804e+06 > 53657 | |
64, 1856, 3328: 1.42939e+07 > 53657 | |
64, 1856, 3328: 1.42939e+07 > 53657 | |
64, 1856, 1280: 4.9878e+06 > 53657 | |
64, 1856, 1280: 4.9878e+06 > 53657 | |
64, 1856, 256: 3.48047e+06 > 53657 | |
64, 1856, 256: 3.48047e+06 > 53657 | |
64, 1408, 3328: 1.28361e+07 > 53657 | |
64, 1408, 3328: 1.28361e+07 > 53657 | |
64, 1408, 1280: 3.53001e+06 > 53657 | |
64, 1408, 1280: 3.53001e+06 > 53657 | |
64, 1408, 256: 2.02268e+06 > 53657 | |
64, 1408, 256: 2.02268e+06 > 53657 | |
64, 1024, 3328: 1.19061e+07 > 53657 | |
64, 1024, 3328: 1.19061e+07 > 53657 | |
64, 1024, 1280: 2.59996e+06 > 53657 | |
64, 1024, 1280: 2.59996e+06 > 53657 | |
64, 1024, 256: 1.09263e+06 > 53657 | |
64, 1024, 256: 1.09263e+06 > 53657 | |
64, 704, 3328: 1.13563e+07 > 53657 | |
64, 704, 3328: 1.13563e+07 > 53657 | |
64, 704, 1280: 2.0502e+06 > 53657 | |
64, 704, 1280: 2.0502e+06 > 53657 | |
64, 704, 256: 542873 > 53657 | |
64, 704, 256: 542873 > 53657 | |
64, 448, 3328: 1.1064e+07 > 53657 | |
64, 448, 3328: 1.1064e+07 > 53657 | |
64, 448, 1280: 1.75785e+06 > 53657 | |
64, 448, 1280: 1.75785e+06 > 53657 | |
64, 448, 256: 250521 > 53657 | |
64, 448, 256: 250521 > 53657 | |
64, 256, 3328: 1.09307e+07 > 53657 | |
64, 256, 3328: 1.09307e+07 > 53657 | |
64, 256, 3136: 9.70191e+06 > 53657 | |
64, 256, 3136: 9.70191e+06 > 53657 | |
64, 256, 1280: 1.6246e+06 > 53657 | |
64, 256, 1280: 1.6246e+06 > 53657 | |
64, 256, 256: 117273 > 53657 | |
64, 256, 256: 117273 > 53657 | |
64, 128, 3328: 1.08828e+07 > 53657 | |
64, 128, 3328: 1.08828e+07 > 53657 | |
64, 128, 1280: 1.57673e+06 > 53657 | |
64, 128, 1280: 1.57673e+06 > 53657 | |
64, 128, 256: 69401 > 53657 | |
64, 128, 256: 69401 > 53657 | |
64, 64, 3328: 1.08712e+07 > 53657 | |
64, 64, 3328: 1.08712e+07 > 53657 | |
64, 64, 3136: 9.64239e+06 > 53657 | |
64, 64, 3136: 9.64239e+06 > 53657 | |
64, 64, 1280: 1.56508e+06 > 53657 | |
64, 64, 1280: 1.56508e+06 > 53657 | |
64, 64, 256: 57753 > 53657 | |
64, 64, 256: 57753 > 53657 | |
Considered 17.5329% of entries. | |
Solution index selected: 22645 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 80 02 00 00 (640) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 80 02 00 00 (640) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: a0 00 00 00 (160) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 05 00 00 00 (5) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: a0 00 00 00 (160) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: a0 00 00 00 (160) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 50 00 00 (20480) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 05 00 00 00 (5) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 80 02 00 00 (640) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 80 02 00 00 (640) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: a0 00 00 00 (160) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 05 00 00 00 (5) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: a0 00 00 00 (160) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: a0 00 00 00 (160) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 50 00 00 (20480) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 05 00 00 00 (5) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 80 02 00 00 (640) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 80 02 00 00 (640) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: a0 00 00 00 (160) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 05 00 00 00 (5) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: a0 00 00 00 (160) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: a0 00 00 00 (160) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 50 00 00 (20480) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 05 00 00 00 (5) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 80 02 00 00 (640) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 80 02 00 00 (640) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: a0 00 00 00 (160) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 05 00 00 00 (5) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: a0 00 00 00 (160) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: a0 00 00 00 (160) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 50 00 00 (20480) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 05 00 00 00 (5) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 80 02 00 00 (640) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 80 02 00 00 (640) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: a0 00 00 00 (160) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 05 00 00 00 (5) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: a0 00 00 00 (160) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: a0 00 00 00 (160) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 50 00 00 (20480) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 05 00 00 00 (5) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 80 02 00 00 (640) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 80 02 00 00 (640) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: a0 00 00 00 (160) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 05 00 00 00 (5) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: a0 00 00 00 (160) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: a0 00 00 00 (160) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 50 00 00 (20480) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 05 00 00 00 (5) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 80 02 00 00 (640) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 80 02 00 00 (640) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: a0 00 00 00 (160) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 05 00 00 00 (5) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: a0 00 00 00 (160) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: a0 00 00 00 (160) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 50 00 00 (20480) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 05 00 00 00 (5) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 80 02 00 00 (640) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 80 02 00 00 (640) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: a0 00 00 00 (160) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 05 00 00 00 (5) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: a0 00 00 00 (160) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: a0 00 00 00 (160) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 50 00 00 (20480) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 05 00 00 00 (5) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 80 02 00 00 (640) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 80 02 00 00 (640) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: a0 00 00 00 (160) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 05 00 00 00 (5) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: a0 00 00 00 (160) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: a0 00 00 00 (160) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 50 00 00 (20480) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 05 00 00 00 (5) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 80 02 00 00 (640) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 80 02 00 00 (640) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: a0 00 00 00 (160) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 05 00 00 00 (5) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: a0 00 00 00 (160) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: a0 00 00 00 (160) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 50 00 00 (20480) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 05 00 00 00 (5) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 80 02 00 00 (640) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 80 02 00 00 (640) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: a0 00 00 00 (160) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 05 00 00 00 (5) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: a0 00 00 00 (160) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: a0 00 00 00 (160) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 50 00 00 (20480) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 05 00 00 00 (5) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 80 02 00 00 (640) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 80 02 00 00 (640) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: a0 00 00 00 (160) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 05 00 00 00 (5) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: a0 00 00 00 (160) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: a0 00 00 00 (160) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 50 00 00 (20480) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 05 00 00 00 (5) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 80 02 00 00 (640) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 80 02 00 00 (640) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: a0 00 00 00 (160) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 05 00 00 00 (5) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: a0 00 00 00 (160) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: a0 00 00 00 (160) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 50 00 00 (20480) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 05 00 00 00 (5) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 80 02 00 00 (640) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 80 02 00 00 (640) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: a0 00 00 00 (160) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 05 00 00 00 (5) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: a0 00 00 00 (160) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: a0 00 00 00 (160) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 50 00 00 (20480) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 05 00 00 00 (5) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 80 02 00 00 (640) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 80 02 00 00 (640) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: a0 00 00 00 (160) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 05 00 00 00 (5) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: a0 00 00 00 (160) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: a0 00 00 00 (160) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 50 00 00 (20480) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 05 00 00 00 (5) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 80 02 00 00 (640) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 80 02 00 00 (640) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: a0 00 00 00 (160) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 05 00 00 00 (5) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: a0 00 00 00 (160) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: a0 00 00 00 (160) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 50 00 00 (20480) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 05 00 00 00 (5) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 80 02 00 00 (640) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 80 02 00 00 (640) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: a0 00 00 00 (160) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 05 00 00 00 (5) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: a0 00 00 00 (160) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: a0 00 00 00 (160) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 50 00 00 (20480) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 05 00 00 00 (5) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 80 02 00 00 (640) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 80 02 00 00 (640) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: a0 00 00 00 (160) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 05 00 00 00 (5) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: a0 00 00 00 (160) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: a0 00 00 00 (160) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 50 00 00 (20480) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 05 00 00 00 (5) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 80 02 00 00 (640) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 80 02 00 00 (640) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: a0 00 00 00 (160) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 05 00 00 00 (5) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: a0 00 00 00 (160) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: a0 00 00 00 (160) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 50 00 00 (20480) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 05 00 00 00 (5) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 80 02 00 00 (640) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 80 02 00 00 (640) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: a0 00 00 00 (160) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 05 00 00 00 (5) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: a0 00 00 00 (160) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: a0 00 00 00 (160) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 50 00 00 (20480) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 05 00 00 00 (5) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 80 02 00 00 (640) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 80 02 00 00 (640) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: a0 00 00 00 (160) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 05 00 00 00 (5) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: a0 00 00 00 (160) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: a0 00 00 00 (160) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 50 00 00 (20480) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 05 00 00 00 (5) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 80 02 00 00 (640) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 80 02 00 00 (640) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: a0 00 00 00 (160) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 05 00 00 00 (5) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: a0 00 00 00 (160) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: a0 00 00 00 (160) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 50 00 00 (20480) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 05 00 00 00 (5) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 80 02 00 00 (640) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 80 02 00 00 (640) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: a0 00 00 00 (160) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 05 00 00 00 (5) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: a0 00 00 00 (160) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: a0 00 00 00 (160) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 50 00 00 (20480) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 05 00 00 00 (5) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 80 02 00 00 (640) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 80 02 00 00 (640) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: a0 00 00 00 (160) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 05 00 00 00 (5) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: a0 00 00 00 (160) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: a0 00 00 00 (160) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 50 00 00 (20480) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 05 00 00 00 (5) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 80 02 00 00 (640) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 80 02 00 00 (640) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: a0 00 00 00 (160) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 05 00 00 00 (5) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: a0 00 00 00 (160) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: a0 00 00 00 (160) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 50 00 00 (20480) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 05 00 00 00 (5) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 80 02 00 00 (640) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 80 02 00 00 (640) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: a0 00 00 00 (160) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 05 00 00 00 (5) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: a0 00 00 00 (160) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: a0 00 00 00 (160) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 50 00 00 (20480) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 05 00 00 00 (5) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 80 02 00 00 (640) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 80 02 00 00 (640) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: a0 00 00 00 (160) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 05 00 00 00 (5) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: a0 00 00 00 (160) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: a0 00 00 00 (160) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 50 00 00 (20480) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 05 00 00 00 (5) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 80 02 00 00 (640) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 80 02 00 00 (640) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: a0 00 00 00 (160) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 05 00 00 00 (5) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: a0 00 00 00 (160) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: a0 00 00 00 (160) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 50 00 00 (20480) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 05 00 00 00 (5) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 80 02 00 00 (640) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 80 02 00 00 (640) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: a0 00 00 00 (160) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 05 00 00 00 (5) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: a0 00 00 00 (160) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: a0 00 00 00 (160) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 50 00 00 (20480) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 05 00 00 00 (5) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 80 02 00 00 (640) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 80 02 00 00 (640) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: a0 00 00 00 (160) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 05 00 00 00 (5) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: a0 00 00 00 (160) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: a0 00 00 00 (160) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 50 00 00 (20480) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 05 00 00 00 (5) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 80 02 00 00 (640) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 80 02 00 00 (640) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: a0 00 00 00 (160) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 05 00 00 00 (5) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8 | |
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32) | |
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100) | |
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[64..67] alpha: 00 00 80 3f (1) | |
[68..71] beta: 00 00 00 00 (0) | |
[72..75] strideD1: 20 00 00 00 (32) | |
[76..79] strideD2: a0 00 00 00 (160) | |
[80..83] strideC1: 20 00 00 00 (32) | |
[84..87] strideC2: a0 00 00 00 (160) | |
[88..91] strideA1: 00 04 00 00 (1024) | |
[92..95] strideA2: 00 80 00 00 (32768) | |
[96..99] strideB1: 00 10 00 00 (4096) | |
[100..103] strideB2: 00 50 00 00 (20480) | |
[104..107] size_0: 20 00 00 00 (32) | |
[108..111] size_1: 05 00 00 00 (5) | |
[112..115] size_2: 20 00 00 00 (32) | |
[116..119] size_3: 80 00 00 00 (128) | |
[120..123] staggerUIter: 00 00 00 00 (0) | |
[124..127] problemNumGroupTiles0: 01 00 00 00 (1) | |
[128..131] problemNumGroupTiles1: 01 00 00 00 (1) | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480 | |
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1 | |
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32) | |
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480) | |
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128) | |
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160) | |
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000) | |
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000) | |
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100) | |
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0) | |
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0) | |
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0) | |
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0) | |
[88..89] alpha: 00 3c (1) | |
[90..91] alpha_2: 00 3c (1) | |
[92..93] beta: 00 00 (0) | |
[94..95] beta_2: 00 00 (0) | |
[96..99] strideD1: 80 00 00 00 (128) | |
[100..103] strideD2: 80 02 00 00 (640) | |
[104..107] strideC1: 80 00 00 00 (128) | |
[108..111] strideC2: 80 02 00 00 (640) | |
[112..115] strideA1: 00 08 00 00 (2048) | |
[116..119] strideA2: 00 00 04 00 (262144) | |
[120..123] strideB1: 20 00 00 00 (32) | |
[124..127] strideB2: a0 00 00 00 (160) | |
[128..131] size_0: 80 00 00 00 (128) | |
[132..135] size_1: 05 00 00 00 (5) | |
[136..139] size_2: 20 00 00 00 (32) | |
[140..143] size_3: 20 00 00 00 (32) | |
[144..147] staggerUIter: 00 00 00 00 (0) | |
[148..151] problemNumGroupTiles0: 08 00 00 00 (8) | |
[152..155] problemNumGroupTiles1: 01 00 00 00 (1) | |
[156..159] numFullBlocks: 01 00 00 00 (1) | |
[160..163] wgmRemainder1: 01 00 00 00 (1) | |
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649) | |
[168..171] pad: 00 00 00 00 (0) | |
, I was on a plane flying across the country. It was one of those long flights where you have to stay awake for a while and then take a nap before getting ready for landing. As I was drifting off to sleep, I noticed that the flight attendants were preparing the cabin for meal service. | |
One of them came down the aisle with a big tray full of drinks and started pouring juice and soda into small cups. She handed me my drink and asked if I wanted any ice in it. I said yes, and she added a few ice cubes to my cup. | |
As she was finishing up her round, one of my neighbors leaned over and whispered to me, "You know what the best part of this flight is?" I shook my head, curious about what he had to say. | |
"The best part of this flight," he said with a sly grin, "is that you get to watch us pour drinks into these little cups all day long." | |
I was taken aback by his response. At first, I thought he was joking or trying to be funny. But then I started thinking about it and realized that he wasn't kidding at all. | |
He was right! The process of pouring drinks into those small cups is actually pretty fascinating. It's a delicate art that requires precision and patience. | |
As the flight attendants continued to pour drinks, I found myself mesmerized by their technique. They moved with ease and grace, carefully measuring out the perfect amount of liquid for each cup. | |
I started to notice all sorts of details about the process. The way they held the pitcher, the way they tilted it just so, the way they caught the overflow in the saucer. It was like a little dance, a choreographed routine that required years of practice to master. | |
As I sat there watching the flight attendants work their magic, I felt a sense of wonder and awe wash over me. Who knew that something as mundane as pouring drinks could be so fascinating? | |
The rest of the flight passed in a blur, but I'll never forget the lesson I learned that day: even the most ordinary tasks can hold hidden beauty and fascination. | |
And from then on, whenever I see someone pouring a drink or doing any other task that requires precision and patience, I try to appreciate the artistry involved. It's amazing how much skill and craftsmanship goes into something as simple as filling a cup with liquid! | |
--- | |
Moral of the story: Even in the most mundane tasks, there can be hidden beauty and fascination. Take time to observe and appreciate the skills and techniques that go into everyday activities. | |
Would you like to hear another story? I have plenty more where this one came from! Just let me know what kind of story you're in the mood for (e.g. funny, inspirational, thought-provoking, etc.)! | |
--- | |
Also, if you enjoyed this story, please consider sharing it with a friend or family member who might appreciate it as well! | |
**What do you think? Should I share another story soon? Let me know your thoughts!** | |
Please reply to this message to let me know what you think. If you'd like, we can discuss more about the story and its themes. Or if you have a specific topic in mind for a future story, feel free to suggest it! | |
Happy listening (or reading) and I look forward to hearing back from you! | |
Best regards, | |
[Your Name] [end of text] | |
CacheMap: 124/128 cache hits |
rocblas_create_handle,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de00000,f16_r,1024,0x7efc8de00100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed | |
rocblas_set_stream,0x5612d27fa890,atomics_allowed | |
rocblas_query_int8_layout_flag,pack_int,atomics_allowed | |
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed | |
rocblas_destroy_handle,atomics_allowed |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
# Replay of an alternating pair of half-precision gemm_batched_ex benchmarks:
#   shape A: m=32,  n=1, k=128 (lda=1024, ldb=4096, ldd=32)
#   shape B: m=128, n=1, k=32  (lda=2048, ldb=32,   ldd=128)
# This section of the recording runs A,B,A,B,...,A: 65 A/B pairs followed by
# one more A, which the loop below reproduces in the same order.
#
# The trailing "| |" that followed each command was table residue from the web
# page this file was copied from; it is a shell syntax error and is removed.
#
# NOTE(review): `--ldc` carries no value in the recording, so `--d_type` is the
# next token on the command line. The intended ldc is not visible here and is
# deliberately left as recorded -- confirm it before trusting the results.
replay_pass=0
while [ "$replay_pass" -lt 65 ]; do
  ./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
  ./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
  replay_pass=$((replay_pass + 1))
done
# Final unpaired run of shape A (the recorded section ends on it).
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1 | |
./rocblas-bench -f gemm_batched_ex --transp |
(Sorry about that, but we can’t show files that are this big right now.)
(Sorry about that, but we can’t show files that are this big right now.)
(Sorry about that, but we can’t show files that are this big right now.)
(Sorry about that, but we can’t show files that are this big right now.)
(Sorry about that, but we can’t show files that are this big right now.)
(Sorry about that, but we can’t show files that are this big right now.)
(Sorry about that, but we can’t show files that are this big right now.)
(Sorry about that, but we can’t show files that are this big right now.)
(Sorry about that, but we can’t show files that are this big right now.)
(Sorry about that, but we can’t show files that are this big right now.)
(Sorry about that, but we can’t show files that are this big right now.)
(Sorry about that, but we can’t show files that are this big right now.)
(Sorry about that, but we can’t show files that are this big right now.)
(Sorry about that, but we can’t show files that are this big right now.)
(Sorry about that, but we can’t show files that are this big right now.)
(Sorry about that, but we can’t show files that are this big right now.)
(Sorry about that, but we can’t show files that are this big right now.)
(Sorry about that, but we can’t show files that are this big right now.)
(Sorry about that, but we can’t show files that are this big right now.)
(Sorry about that, but we can’t show files that are this big right now.)